未验证 提交 101e20e9 编写于 作者: G Guo Sheng 提交者: GitHub

Update some en api docs (#20496)

* Fix api docs. test=document-fix

* Fix en docs. test=develop

* Fix the doc of dynamic_gru. test=document_fix

* Update API.spec. test=document_fix

* Fix codestyle test=develop, test=document_fix
上级 200cc5e2
...@@ -127,15 +127,15 @@ paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', ...@@ -127,15 +127,15 @@ paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes',
paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'c51fcac7a4f5786ca41f27fa60bd22c5')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'c51fcac7a4f5786ca41f27fa60bd22c5'))
paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'd4a82e2f5feb20c4a23ced8054e047ed')) paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'd4a82e2f5feb20c4a23ced8054e047ed'))
paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b35fe3e0c2ecca15a8be658277e064ec')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b35fe3e0c2ecca15a8be658277e064ec'))
paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3')) paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', 'a3364b36fb3190b9bd75e419aa75573b'))
paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e')) paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '0b10a755b469d0b85b3a5cac38b4cf01'))
paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b')) paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b'))
paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '708ce0348b74d3e0c7885c2c524b7fa7')) paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '708ce0348b74d3e0c7885c2c524b7fa7'))
paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '48ec1ba2d75c4e2faf8d9a47350462ae')) paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '48ec1ba2d75c4e2faf8d9a47350462ae'))
paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'd1985a930a59c3bd41a7c1d72594f5b9')) paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'd1985a930a59c3bd41a7c1d72594f5b9'))
paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ae57e6e5136dade436f0df1f11770afa')) paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ae57e6e5136dade436f0df1f11770afa'))
paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '4ed09e115b50ec7393674c4c09d223a2')) paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '4ed09e115b50ec7393674c4c09d223a2'))
paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c')) paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a8aa2071cae18df1e8dde9183d64bfb1'))
paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', 'ebddcc5a1073ef065d22b4673e36b1d2')) paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', 'ebddcc5a1073ef065d22b4673e36b1d2'))
paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'e91c63b8ac8c35982c0ac518537e44bf')) paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'e91c63b8ac8c35982c0ac518537e44bf'))
paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'feff9c8ebb4d4d0be5345f9042f57c8e')) paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'feff9c8ebb4d4d0be5345f9042f57c8e'))
...@@ -149,14 +149,14 @@ paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_t ...@@ -149,14 +149,14 @@ paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_t
paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '1400433bae7876d0407ae205be39b7a1')) paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '1400433bae7876d0407ae205be39b7a1'))
paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', '23d6fba8ad8495f67a66d8878be5b0be')) paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', '23d6fba8ad8495f67a66d8878be5b0be'))
paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '5ba4cdb4ea5c03382da545335ffc05b7')) paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '5ba4cdb4ea5c03382da545335ffc05b7'))
paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0')) paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'eafa177a7fed6178a51c1affa7f46a40'))
paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'ed24c2d0f82cd9a3b40488157285a584')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'ed24c2d0f82cd9a3b40488157285a584'))
paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'efb1e3bc87339cb26faa2edae210e8b0')) paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'efb1e3bc87339cb26faa2edae210e8b0'))
paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0')) paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0'))
paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50')) paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50'))
paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'df08b9c499ab3a90f95d08ab5b6c6c62')) paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'df08b9c499ab3a90f95d08ab5b6c6c62'))
paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e478180d5bc010a84f35af958cafa62c')) paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e478180d5bc010a84f35af958cafa62c'))
paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21')) paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'f5a878b6166f34878376a58d7e6fa95c'))
paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'ecb55075fdf89a866bcede85e60aebad')) paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'ecb55075fdf89a866bcede85e60aebad'))
paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '968c9b17affaf714e5021c3dc8d68c73')) paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '968c9b17affaf714e5021c3dc8d68c73'))
paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd37e3a9a05c00e032d4b7876c4f6b414')) paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd37e3a9a05c00e032d4b7876c4f6b414'))
...@@ -181,7 +181,7 @@ paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', ...@@ -181,7 +181,7 @@ paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride',
paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '38297567127888e01542857839058d52')) paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '38297567127888e01542857839058d52'))
paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242')) paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242'))
paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50')) paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50'))
paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096')) paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '2b505ddaa309fd7b9be5445e41ca76d5'))
paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a6477957b44907787b3c74157400b80c')) paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a6477957b44907787b3c74157400b80c'))
paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23')) paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '678de6d6d0c93da74189990b039daae8')) paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '678de6d6d0c93da74189990b039daae8'))
...@@ -281,7 +281,7 @@ paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', ...@@ -281,7 +281,7 @@ paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes',
paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3')) paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '90c74742f48c70b103f1fbb9eb129066')) paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '90c74742f48c70b103f1fbb9eb129066'))
paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'ef1701e11d60508fe8f02dd2a8c60bdf')) paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'ef1701e11d60508fe8f02dd2a8c60bdf'))
paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937')) paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bd8b28e6c1640b13a42b0524f86f7800'))
paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6755168c4b2308e1e4f54cb56fa7dcb2')) paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6755168c4b2308e1e4f54cb56fa7dcb2'))
paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164')) paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164'))
paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d')) paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d'))
...@@ -922,7 +922,7 @@ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self ...@@ -922,7 +922,7 @@ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self
paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '5e89c978199c4ecce2b26d5fed1ec52b')) paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '5e89c978199c4ecce2b26d5fed1ec52b'))
paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'b2d435f782ac8ea3ca480b8d24e7f5b4')) paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'b2d435f782ac8ea3ca480b8d24e7f5b4'))
paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '3efe197c8e3e75f84a4c464d8b74e943')) paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '3efe197c8e3e75f84a4c464d8b74e943'))
paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b')) paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '375898e47266633635f4c2096e1ac296'))
paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', 'a59c581d5969266427e841abe69f694a')) paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', 'a59c581d5969266427e841abe69f694a'))
paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'fc09d6e6c1083cec2dce51f6f9f4ecaf')) paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'fc09d6e6c1083cec2dce51f6f9f4ecaf'))
paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
...@@ -452,6 +452,7 @@ def center_loss(input, ...@@ -452,6 +452,7 @@ def center_loss(input,
centers_param = helper.create_parameter( centers_param = helper.create_parameter(
attr=param_attr, shape=centers_shape, dtype=dtype) attr=param_attr, shape=centers_shape, dtype=dtype)
centers_param.stop_gradient = True centers_param.stop_gradient = True
if isinstance(alpha, Variable): if isinstance(alpha, Variable):
alpha_param = alpha alpha_param = alpha
else: else:
...@@ -1215,13 +1216,16 @@ def dynamic_gru(input, ...@@ -1215,13 +1216,16 @@ def dynamic_gru(input,
h_0=None, h_0=None,
origin_mode=False): origin_mode=False):
""" """
**Gated Recurrent Unit (GRU) Layer** **Note: The input type of this must be LoDTensor. If the input type to be
processed is Tensor, use** :ref:`api_fluid_layers_StaticRNN` .
if origin_mode is False, then the equation of a gru step is from paper This operator is used to perform the calculations for a single layer of
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Gated Recurrent Unit (GRU) on full sequences step by step. The calculations
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_ . in one time step support these two modes:
The formula is as follows: If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math:: .. math::
...@@ -1231,12 +1235,12 @@ def dynamic_gru(input, ...@@ -1231,12 +1235,12 @@ def dynamic_gru(input,
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t} h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if origin_mode is True then the equation is from paper if ``origin_mode`` is False, then the formula used is from paper
Learning Phrase Representations using RNN Encoder-Decoder for Statistical `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math:: .. math::
...@@ -1246,59 +1250,56 @@ def dynamic_gru(input, ...@@ -1246,59 +1250,56 @@ def dynamic_gru(input,
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` :math:`x_t` is the input of current time step, but it is not from ``input`` .
is the update gate and reset gate activation function and :math:`sigmoid` This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
is usually used for it. :math:`act_c` is the activation function for **Note** thus a fully-connected layer whose size is 3 times of ``size`` should
candidate hidden state and :math:`tanh` is usually used for it. be used before this operator, and the output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrix are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
the input :math:`x_{t}` are NOT included in this operator. Users can choose
to use a fully-connected layer before the GRU layer.
Args: Args:
input(Variable): The input of dynamic_gru layer, which supports input(Variable): A LoDTensor whose lod level is 1, representing the input
variable-time length input sequence. The underlying tensor in this after linear projection. Its shape should be :math:`[T, D \\times 3]` ,
Variable is a matrix with shape :math:`(T \\times 3D)`, where where :math:`T` stands for the total sequence lengths in this mini-batch,
:math:`T` is the total time steps in this mini-batch, :math:`D` :math:`D` for the hidden size. The data type should be float32 or float64.
is the hidden size. size(int): Indicate the hidden size.
size(int): The dimension of the gru cell. param_attr(ParamAttr, optional): To specify the weight parameter property.
param_attr(ParamAttr|None): The parameter attribute for the learnable Default: None, which means the default weight parameter property is used.
hidden-hidden weight matrix. Note: See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
- The shape of the weight matrix is :math:`(T \\times 3D)`, where Default: None, which means the default bias parameter property is used.
:math:`D` is the hidden size. See usage for details in :ref:`api_fluid_ParamAttr` .
- All elements in the weight matrix can be divided into two parts. is_reverse(bool, optional): Whether to compute in the reversed order of
The first part are weights of the update gate and reset gate with input sequences. Default False.
shape :math:`(D \\times 2D)`, and the second part are weights for gate_activation(str, optional): The activation function corresponding to
candidate hidden state with shape :math:`(D \\times D)`. :math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
If it is set to None or one attribute of ParamAttr, dynamic_gru will candidate_activation(str, optional): The activation function corresponding to
create ParamAttr as param_attr. If the Initializer of the param_attr :math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
is not set, the parameter is initialized with Xavier. Default: None. are supported. Default "tanh".
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias h_0 (Variable, optional): A Tensor representing the initial hidden state.
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates If not provided, the default initial hidden state is 0. The shape is
the bias in the update gate, reset gate and candidate calculations. :math:`[N, D]` , where :math:`N` is the number of sequences in the
If it is set to False, no bias will be applied to the update gate, mini-batch, :math:`D` for the hidden size. The data type should be
reset gate and candidate calculations. If it is set to None or one same as ``input`` . Default None.
attribute of ParamAttr, dynamic_gru will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
is_reverse(bool): Whether to compute reversed GRU, default
:attr:`False`.
gate_activation(str): The activation for update gate and reset gate.
Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
candidate_activation(str): The activation for candidate hidden state.
Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
h_0 (Variable): This is initial hidden state. If not set, default is
zero. This is a tensor with shape (N x D), where N is the number of
total time steps of input mini-batch feature and D is the hidden
size.
Returns: Returns:
Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ Variable: A LoDTensor whose lod level is 1 and shape is :math:`[T, D]` , \
and sequence length is the same with the input. where :math:`T` stands for the total sequence lengths in this mini-batch \
:math:`D` for the hidden size. It represents GRU transformed sequence output, \
and has the same lod and data type with ``input`` .
Examples: Examples:
...@@ -1307,9 +1308,11 @@ def dynamic_gru(input, ...@@ -1307,9 +1308,11 @@ def dynamic_gru(input,
import paddle.fluid as fluid import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64 dict_dim, emb_dim = 128, 64
data = fluid.layers.data(name='sequence', shape=[1], data = fluid.data(name='sequence',
dtype='int32', lod_level=1) shape=[None],
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) dtype='int64',
lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512 hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3) x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim) hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
...@@ -1365,79 +1368,85 @@ def gru_unit(input, ...@@ -1365,79 +1368,85 @@ def gru_unit(input,
gate_activation='sigmoid', gate_activation='sigmoid',
origin_mode=False): origin_mode=False):
""" """
**GRU unit layer** Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for
one time step and it supports these two modes:
if origin_mode is True, then the equation of a gru step is from paper If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder-Decoder for Statistical `Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math:: .. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
if origin_mode is False, then the equation of a gru step is from paper if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_ Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math:: .. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms :math:`x_t` is the input of current time step, but it is not ``input`` .
of the equation above, the :math:`z_t` is split into 3 parts - This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
:math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to **Note** thus a fully-connected layer whose size is 3 times the GRU hidden size should
implement a full GRU unit operator for an input, a fully be used before this operator, and the output should be used as ``input`` here.
connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. :math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrices are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three biases are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
an intermediate candidate hidden output, which is denoted by :math:`m_t`.
This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
Args: Args:
input (Variable): The fc transformed input value of current step. input(Variable): A 2D Tensor representing the input
hidden (Variable): The hidden value of gru unit from previous step. after linear projection. Its shape should be :math:`[N, D \\times 3]` ,
size (integer): The input dimension value. where :math:`N` stands for batch size, :math:`D` for the hidden size.
param_attr(ParamAttr|None): The parameter attribute for the learnable The data type should be float32 or float64.
hidden-hidden weight matrix. Note: hidden(Variable): A 2D Tensor representing the hidden state from previous step.
Its shape should be :math:`[N, D]` , where :math:`N` stands for batch size,
- The shape of the weight matrix is :math:`(T \\times 3D)`, where :math:`D` for the hidden size. The data type should be same as ``input`` .
:math:`D` is the hidden size. size(int): Indicate the hidden size.
- All elements in the weight matrix can be divided into two parts. param_attr(ParamAttr, optional): To specify the weight parameter property.
The first part are weights of the update gate and reset gate with Default: None, which means the default weight parameter property is used.
shape :math:`(D \\times 2D)`, and the second part are weights for See usage for details in :ref:`api_fluid_ParamAttr` .
candidate hidden state with shape :math:`(D \\times D)`. bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
If it is set to None or one attribute of ParamAttr, gru_unit will See usage for details in :ref:`api_fluid_ParamAttr` .
create ParamAttr as param_attr. If the Initializer of the param_attr activation(str, optional): The activation function corresponding to
is not set, the parameter is initialized with Xavier. Default: None. :math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias are supported. Default "tanh".
of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates gate_activation(str, optional): The activation function corresponding to
the bias in the update gate, reset gate and candidate calculations. :math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
If it is set to False, no bias will be applied to the update gate, are supported. Default "sigmoid".
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
activation (string): The activation type for cell (actNode).
Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
Default: 'sigmoid'
Returns: Returns:
tuple: The hidden value, reset-hidden value and gate values. tuple: The tuple contains three Tensor variables with the same data type \
as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \
reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \
concatenation of :math:`u_t, r_t, \\tilde{h_t}` . And they have shape \
:math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \\times 3]` respectively. \
Usually only the hidden state for next time step ( :math:`h_t` ) is used \
as output and state, the other two are intermediate results of calculations.
Examples: Examples:
...@@ -1446,12 +1455,12 @@ def gru_unit(input, ...@@ -1446,12 +1455,12 @@ def gru_unit(input,
import paddle.fluid as fluid import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64 dict_dim, emb_dim = 128, 64
data = fluid.layers.data(name='step_data', shape=[1], dtype='int32') data = fluid.data(name='step_data', shape=[None], dtype='int64')
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512 hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3) x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
pre_hidden = fluid.layers.data( pre_hidden = fluid.data(
name='pre_hidden', shape=[hidden_dim], dtype='float32') name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
hidden = fluid.layers.gru_unit( hidden = fluid.layers.gru_unit(
input=x, hidden=pre_hidden, size=hidden_dim * 3) input=x, hidden=pre_hidden, size=hidden_dim * 3)
...@@ -2028,17 +2037,14 @@ def chunk_eval(input, ...@@ -2028,17 +2037,14 @@ def chunk_eval(input,
excluded_chunk_types=None, excluded_chunk_types=None,
seq_length=None): seq_length=None):
""" """
**Chunk Evaluator** This operator computes the precision, recall and F1-score for chunk detection.
It is often used in sequence tagging tasks, such as Named Entity Recognition(NER).
This function computes and outputs the precision, recall and
F1-score of chunk detection.
For some basics of chunking, please refer to For some basics of chunking, please refer to
`Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ . `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, This operator supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. Here is a NER example for the usage of these tagging schemes:
Here is a NER example of labeling for these tagging schemes:
.. code-block:: python .. code-block:: python
...@@ -2052,11 +2058,11 @@ def chunk_eval(input, ...@@ -2052,11 +2058,11 @@ def chunk_eval(input,
====== ====== ====== ===== == ============ ===== ===== ===== == ========= ====== ====== ====== ===== == ============ ===== ===== ===== == =========
There are three chunk types(named entity types) including PER(person), ORG(organization) There are three chunk types(named entity types) including PER(person), ORG(organization)
and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>. and LOC(location), and we can see that the labels have the form `<tag type>-<chunk type>` .
Since the calculations actually use label ids rather than labels, extra attention Since the implementation of this operator actually uses label ids rather than
should be paid when mapping labels to ids to make CheckEvalOp work. The key point label strings, to make it work, there should be a way to map label ids to
is that the listed equations are satisfied by ids. tag types and chunk types. This operator uses the following way to do mapping:
.. code-block:: python .. code-block:: python
...@@ -2074,8 +2080,8 @@ def chunk_eval(input, ...@@ -2074,8 +2080,8 @@ def chunk_eval(input,
IOE - 0 1 - IOE - 0 1 -
IOBES 0 1 2 3 IOBES 0 1 2 3
Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, Accordingly, in the above NER example, if the tagging scheme is IOB and chunk
PER and LOC. To satisfy the above equations, the label map can be like this: types are ORG, PER and LOC, then the label ids would be as follows:
.. code-block:: python .. code-block:: python
...@@ -2087,23 +2093,32 @@ def chunk_eval(input, ...@@ -2087,23 +2093,32 @@ def chunk_eval(input,
I-LOC 5 I-LOC 5
O 6 O 6
It's not hard to verify the equations noting that the num of chunk types With which we can map each label id to the corresponding tag type and chunk
is 3 and the num of tag types in IOB scheme is 2. For example, the label type correctly.
id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
I-LOC is 2, which consistent with the results from the equations.
Args: Args:
input (Variable): prediction output of the network. input (Variable): A Tensor or LoDTensor, representing the predicted labels
label (Variable): label of the test data set. from the network. When it is a Tensor, its shape would be `[N, M, 1]`,
chunk_scheme (str): ${chunk_scheme_comment} where `N` stands for batch size, `M` for sequence length; When it is
num_chunk_types (int): ${num_chunk_types_comment} a LoDTensor, its shape would be `[N, 1]` where `N` stands for the total
excluded_chunk_types (list): ${excluded_chunk_types_comment} sequence lengths in this mini-batch. The data type should be int64.
seq_length(Variable): 1-D Tensor specifying sequence length when input and label are Tensor type. label (Variable): A Tensor or LoDTensor representing the ground-truth labels.
It should have the same shape, lod and data type as ``input`` .
chunk_scheme (str): Indicate the tagging schemes used here. The value must
be IOB, IOE, IOBES or plain.
num_chunk_types (int): The number of chunk types.
excluded_chunk_types (list, optional): Indicate the chunk types that shouldn't
be taken into account. It should be a list of chunk type ids(integer).
Default None.
seq_length(Variable, optional): A 1D Tensor containing the length of each
sequence when ``input`` and ``label`` are Tensor. It needn't be
provided if ``input`` and ``label`` are LoDTensor. Default None.
Returns: Returns:
tuple: tuple containing: precision, recall, f1_score, tuple: A tuple including precision, recall, F1-score, chunk number detected, \
num_infer_chunks, num_label_chunks, chunk number in ground-truth, chunk number correctly detected. Each \
num_correct_chunks is a Tensor with shape `[1]`. The data type of precision, recall and \
F1-score is float32, and the data type of the others is int64.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2112,9 +2127,9 @@ def chunk_eval(input, ...@@ -2112,9 +2127,9 @@ def chunk_eval(input,
dict_size = 10000 dict_size = 10000
label_dict_len = 7 label_dict_len = 7
sequence = fluid.layers.data( sequence = fluid.data(
name='id', shape=[1], lod_level=1, dtype='int64') name='id', shape=[-1, 1], lod_level=1, dtype='int64')
embedding = fluid.layers.embedding( embedding = fluid.embedding(
input=sequence, size=[dict_size, 512]) input=sequence, size=[dict_size, 512])
hidden = fluid.layers.fc(input=embedding, size=512) hidden = fluid.layers.fc(input=embedding, size=512)
label = fluid.layers.data( label = fluid.layers.data(
...@@ -5644,64 +5659,71 @@ def beam_search(pre_ids, ...@@ -5644,64 +5659,71 @@ def beam_search(pre_ids,
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_ Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details. for more details.
This layer does the search in beams for one time step. Specifically, it **This operator only supports LoDTensor.** It is used after finishing
selects the top-K candidate word ids of current step from :attr:`ids` scores calculation to perform beam search for one time step. Specifically,
according to their :attr:`scores` for all source sentences, where K is after ``ids`` and ``scores`` have been produced, it selects the top-K
:attr:`beam_size` and :attr:`ids, scores` are predicted results from the ( `k` is ``beam_size`` ) candidate word ids of current step from ``ids``
computation cell. If :attr:`ids` is not set, it will be calculated out according to the corresponding ``scores``. Additionally, ``pre_ids`` and
according to :attr:`scores`. Additionally, :attr:`pre_ids` and ``pre_scores`` are the output of `beam_search` at previous step, they
:attr:`pre_scores` are the output of beam_search at previous step, they
are needed for special use to handle ended candidate translations. are needed for special use to handle ended candidate translations.
Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores` Note that if ``is_accumulated`` is True, the ``scores`` passed in should
passed in should be accumulated scores. Else, the :attr:`scores` are be accumulated scores. Otherwise, the ``scores`` are
considered as the straightforward scores and will be transformed to the considered as the probabilities of single step and would be transformed to
log field and accumulated the :attr:`pre_scores` in this operator. the log field and added up with ``pre_scores`` for final scores in this
Length penalty should be done with extra operators before calculating the operator. Length penalty should be done with extra operators before calculating
accumulated scores if needed. the accumulated scores if needed.
Please see the following demo for a full beam search usage example: Please see the following demo for a full beam search usage example:
fluid/tests/book/test_machine_translation.py fluid/tests/book/test_machine_translation.py
Args: Args:
pre_ids(Variable): The LodTensor variable which is the output of pre_ids(Variable): A LodTensor variable (lod level is 2), representing
beam_search at previous step. It should be a LodTensor with shape the selected ids of previous step. It is the output of beam_search
:math:`(batch_size, 1)` and lod at previous step. Its shape is `[batch_size, 1]` and its lod is
:math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
first step. first step. The data type should be int64.
pre_scores(Variable): The LodTensor variable which is the output of pre_scores(Variable): A LodTensor variable has the same shape and lod
beam_search at previous step. with ``pre_ids`` , representing the accumulated scores corresponding
ids(Variable): The LodTensor variable containing the candidates ids. to the selected ids of previous step. It is the output of
Its shape should be :math:`(batch_size \\times beam_size, K)`, beam_search at previous step. The data type should be float32.
where :math:`K` supposed to be :attr:`beam_size`. ids(Variable|None): A LodTensor variable containing the candidates ids.
scores(Variable): The LodTensor variable containing the accumulated It has the same lod with ``pre_ids`` and its shape should be
scores corresponding to :attr:`ids` and its shape is the same as `[batch_size * beam_size, K]`, where `K` is supposed to be greater than
the shape of :attr:`ids`. ``beam_size`` and the first dimension size (decrease as samples reach
to the end) should be same as that of ``pre_ids`` . The data type
should be int64. It can be None, in which case the indices in ``scores`` are used as
ids.
scores(Variable): A LodTensor variable containing the accumulated
scores corresponding to ``ids`` . Both its shape and lod are same as
those of ``ids`` . The data type should be float32.
beam_size(int): The beam width used in beam search. beam_size(int): The beam width used in beam search.
end_id(int): The id of end token. end_id(int): The id of end token.
level(int, default 0): It can be ignored and mustn't change currently. level(int): **It can be ignored and mustn't change currently.**
It means the source level of lod, which is explained as following. The 2 level lod used in this operator has the following
The lod level of :attr:`ids` should be 2. The first level is source meaning: The first level describes how many beams each sample has,
level which describes how many prefixes (branchs) for each source which would change to 0 when beams of the sample all end (batch reduce);
sentece (beam), and the second level is sentence level which The second level describes how many times each beam is selected.
describes how these candidates belong to the prefix. The paths Default 0, which shouldn't be changed currently.
linking prefixes and selected candidates are organized and reserved is_accumulated(bool): Whether the input ``scores`` is accumulated scores.
in lod. Default True.
is_accumulated(bool, default True): Whether the input :attr:`score` is name(str, optional): For detailed information, please refer
accumulated scores. to :ref:`api_guide_Name`. Usually name is no need to set and
name(str|None): A name for this layer(optional). If set None, the layer None by default.
will be named automatically. return_parent_idx(bool, optional): Whether to return an extra Tensor variable
return_parent_idx(bool): Whether to return an extra Tensor variable in output, which stores the selected ids' parent indices in
preserving the selected_ids' parent indice in pre_ids ``pre_ids`` and can be used to update RNN's states by gather operator.
in output, which can be used to gather cell states at Default False.
the next time step.
Returns: Returns:
Variable: The LodTensor tuple containing the selected ids and the \ tuple: The tuple contains two or three LodTensor variables. The two LodTensor, \
corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \ representing the selected ids and the corresponding accumulated scores of \
an extra Tensor variable preserving the selected_ids' parent indice \ current step, have the same shape `[batch_size, beam_size]` and lod with 2 levels, \
is included. and have data types int64 and float32. If ``return_parent_idx`` is True, \
an extra Tensor variable preserving the selected ids' parent indices \
is included, whose shape is `[batch_size * beam_size]` and data type \
is int64.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -5713,12 +5735,12 @@ def beam_search(pre_ids, ...@@ -5713,12 +5735,12 @@ def beam_search(pre_ids,
# at previous step. # at previous step.
beam_size = 4 beam_size = 4
end_id = 1 end_id = 1
pre_ids = fluid.layers.data( pre_ids = fluid.data(
name='pre_id', shape=[1], lod_level=2, dtype='int64') name='pre_id', shape=[None, 1], lod_level=2, dtype='int64')
pre_scores = fluid.layers.data( pre_scores = fluid.data(
name='pre_scores', shape=[1], lod_level=2, dtype='float32') name='pre_scores', shape=[None, 1], lod_level=2, dtype='float32')
probs = fluid.layers.data( probs = fluid.data(
name='probs', shape=[10000], dtype='float32') name='probs', shape=[None, 10000], dtype='float32')
topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size) topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
accu_scores = fluid.layers.elementwise_add( accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(x=topk_scores), x=fluid.layers.log(x=topk_scores),
...@@ -5772,28 +5794,46 @@ def beam_search(pre_ids, ...@@ -5772,28 +5794,46 @@ def beam_search(pre_ids,
def beam_search_decode(ids, scores, beam_size, end_id, name=None): def beam_search_decode(ids, scores, beam_size, end_id, name=None):
""" """
Beam Search Decode Layer. This layer constructs the full hypotheses for This operator is used after beam search has completed. It constructs the
each source sentence by walking back along the LoDTensorArray :attr:`ids` full predicted sequences for each sample by walking back along the search
whose lods can be used to restore the path in the beam search tree. paths stored in lod of ``ids`` . The result sequences are stored in a
LoDTensor, which uses the following way to parse:
.. code-block:: text
If lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
The first level of lod stands for: There are 2 samples each having 3
(beam width) predicted sequences.
The second level of lod stands for: The lengths of the first sample's
3 predicted sequences are 12, 12, 16; The lengths of the second sample's
3 predicted sequences are 14, 13, 15.
Please see the following demo for a full beam search usage example: Please see the following demo for a full beam search usage example:
fluid/tests/book/test_machine_translation.py fluid/tests/book/test_machine_translation.py
Args: Args:
ids(Variable): The LodTensorArray variable containing the selected ids ids(Variable): The LoDTensorArray variable containing the selected ids
of all steps. of all steps. Each LoDTensor in it has int64 data type and 2 level
scores(Variable): The LodTensorArray variable containing the selected lod which can be used to get the search paths.
scores of all steps. scores(Variable): The LodTensorArray variable containing the accumulated
scores corresponding to selected ids of all steps. It has the same size
as ``ids`` . Each LoDTensor in it has the same shape and lod as the
counterpart in ``ids`` , and has a float32 data type.
beam_size(int): The beam width used in beam search. beam_size(int): The beam width used in beam search.
end_id(int): The id of end token. end_id(int): The id of end token.
name(str|None): A name for this layer(optional). If set None, the layer name(str, optional): For detailed information, please refer
will be named automatically. to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns: Returns:
Variable: The LodTensor pair containing the generated id sequences \ tuple: The tuple contains two LodTensor variables. The two LodTensor, \
and the corresponding scores. The shapes and lods of the two \ containing the full sequences of ids and the corresponding accumulated \
LodTensor are same. The lod level is 2 and the two levels \ scores, have the same shape flattened to 1D and have the same 2 level \
separately indicate how many hypotheses each source sentence has \ lod. The lod can be used to get how many predicted sequences each sample \
and how many ids each hypothesis has. has and how many ids each predicted sequence has.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -5832,71 +5872,67 @@ def lstm_unit(x_t, ...@@ -5832,71 +5872,67 @@ def lstm_unit(x_t,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
name=None): name=None):
"""Lstm unit layer. The equation of a lstm step is: """
Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for
.. math:: one time step, whose implementation is based on calculations described in `RECURRENT
NEURAL NETWORK REGULARIZATION <http://arxiv.org/abs/1409.2329>`_ .
i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
h_t & = o_t tanh(c_t)
The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and We add forget_bias to the biases of the forget gate in order to
:math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}` reduce the scale of forgetting. The formula is as follows:
should be same. The implementation separates the linear transformation and
non-linear transformation apart. Here, we take :math:`i_t` as an example.
The linear transformation is applied by calling a `fc` layer and the
equation is:
.. math:: .. math::
L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
The non-linear transformation is applied by calling `lstm_unit_op` and the f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
equation is:
.. math:: c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
i_t = \sigma(L_{i_t}) o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
This layer has two outputs including :math:`h_t` and :math:`c_t`. h_{t} & = o_{t} tanh (c_{t})
:math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step;
:math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` ,
representing the outputs from the previous time step.
:math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are input gate, forget gate, cell, output gate
and hidden calculation.
Args: Args:
x_t (Variable): The input value of current step, a 2-D tensor with shape x_t(Variable): A 2D Tensor representing the input of current time step.
M x N, M for batch size and N for input size. Its shape should be :math:`[N, M]` , where :math:`N` stands for batch
hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor size, :math:`M` for the feature size of input. The data type should
with shape M x S, M for batch size and S for size of lstm unit. be float32 or float64.
cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with hidden_t_prev(Variable): A 2D Tensor representing the hidden value from
shape M x S, M for batch size and S for size of lstm unit. previous step. Its shape should be :math:`[N, D]` , where :math:`N`
forget_bias (float): The forget bias of lstm unit. stands for batch size, :math:`D` for the hidden size. The data type
param_attr(ParamAttr|None): The parameter attribute for the learnable should be same as ``x_t`` .
hidden-hidden weights. cell_t_prev(Variable): A 2D Tensor representing the cell value from
If it is set to None or one attribute of ParamAttr, previous step. It has the same shape and data type with ``hidden_t_prev`` .
lstm_unit will create ParamAttr as param_attr. forget_bias (float, optional): :math:`forget\\_bias` added to the biases
If the Initializer of the param_attr is not set, the of the forget gate. Default 0.
parameter is initialized with Xavier. Default: None. param_attr(ParamAttr, optional): To specify the weight parameter property.
bias_attr (ParamAttr|None): The bias attribute for the learnable bias Default: None, which means the default weight parameter property is used.
weights. If it is set to False, no bias will be added See usage for details in :ref:`api_fluid_ParamAttr` .
to the output units. If it is set to None or one attribute of ParamAttr, bias_attr (ParamAttr, optional): To specify the bias parameter property.
lstm_unit will create ParamAttr as bias_attr. Default: None, which means the default bias parameter property is used.
If the Initializer of the bias_attr is not set, See usage for details in :ref:`api_fluid_ParamAttr` .
the bias is initialized zero. Default: None. name(str, optional): For detailed information, please refer
name(str|None): A name for this layer(optional). If set None, the layer to :ref:`api_guide_Name`. Usually name is no need to set and
will be named automatically. None by default.
Returns: Returns:
tuple: The hidden value and cell value of lstm unit. tuple: The tuple contains two Tensor variables with the same shape and \
data type with ``hidden_t_prev`` , representing the hidden value and \
cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \
the formula.
Raises: Raises:
ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev** ValueError: Rank of x_t must be 2.
not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** ValueError: Rank of hidden_t_prev must be 2.
and **cell_t_prev** not be the same or the 2nd dimensions of ValueError: Rank of cell_t_prev must be 2.
**hidden_t_prev** and **cell_t_prev** not be the same. ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same.
ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same.
Examples: Examples:
...@@ -5905,12 +5941,12 @@ def lstm_unit(x_t, ...@@ -5905,12 +5941,12 @@ def lstm_unit(x_t,
import paddle.fluid as fluid import paddle.fluid as fluid
dict_dim, emb_dim, hidden_dim = 128, 64, 512 dict_dim, emb_dim, hidden_dim = 128, 64, 512
data = fluid.layers.data(name='step_data', shape=[1], dtype='int32') data = fluid.data(name='step_data', shape=[None], dtype='int64')
x = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) x = fluid.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.layers.data( pre_hidden = fluid.data(
name='pre_hidden', shape=[hidden_dim], dtype='float32') name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
pre_cell = fluid.layers.data( pre_cell = fluid.data(
name='pre_cell', shape=[hidden_dim], dtype='float32') name='pre_cell', shape=[None, hidden_dim], dtype='float32')
hidden = fluid.layers.lstm_unit( hidden = fluid.layers.lstm_unit(
x_t=x, x_t=x,
hidden_t_prev=pre_hidden, hidden_t_prev=pre_hidden,
...@@ -15035,12 +15071,13 @@ def teacher_student_sigmoid_loss(input, ...@@ -15035,12 +15071,13 @@ def teacher_student_sigmoid_loss(input,
def add_position_encoding(input, alpha, beta, name=None): def add_position_encoding(input, alpha, beta, name=None):
""" """
**Add Position Encoding Layer** This operator performs weighted sum of input feature at each position
(position in the sequence) and the corresponding position encoding.
This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an For more details of position encoding, please refer to `Attention Is All You
output Tensor of shape [N x M x P] with positional encoding value. Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ . The formula is as follows:
.. math:: .. math::
PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\ PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\
...@@ -15048,28 +15085,36 @@ def add_position_encoding(input, alpha, beta, name=None): ...@@ -15048,28 +15085,36 @@ def add_position_encoding(input, alpha, beta, name=None):
Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where: Where:
- :math:`PE(pos, 2i)` : the increment for the number at even position - :math:`PE(pos, 2i)` : the value at even index `2i` for encoding of position `pos`.
- :math:`PE(pos, 2i + 1)` : the increment for the number at odd position - :math:`PE(pos, 2i + 1)` : the value at odd index `2i+1` for encoding of position `pos`.
Args: Args:
input (Variable): 3-D input tensor with shape [N x M x P] input(Variable): A Tensor or LoDTensor (lod level is 1). If it is a
alpha (float): multiple of Input Tensor Tensor, the shape should be `[N, M, P]`, where `N` stands for
beta (float): multiple of Positional Encoding Tensor batch size, `M` for sequence length, `P` for the size of feature
name (string): the name of position encoding layer dimension. If it is a LoDTensor, the shape should be `[N, P]`,
where `N` stands for the total sequence lengths in this mini-batch,
`P` for the size of feature. The data type should be float32 or float64.
alpha(float): Indicate the weight coefficient for `input` when performing
weighted sum.
beta(float): Indicate the weight coefficient for position encoding when
performing weighted sum.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually there is no need to set name, which is
None by default.
Returns: Returns:
Variable: A 3-D Tensor of shape [N x M x P] with positional encoding. Variable: A Tensor or LoDTensor. It has the same shape, data type and lod as `input`.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle.fluid as fluid
tensor = fluid.layers.data( tensor = fluid.data(
name='tensor', name='tensor',
shape=[32, 64, 512], shape=[None, 64, 512],
dtype='float32', dtype='float32')
append_batch_size=False)
position_tensor = fluid.layers.add_position_encoding( position_tensor = fluid.layers.add_position_encoding(
input=tensor, alpha=1.0, beta=1.0) input=tensor, alpha=1.0, beta=1.0)
......
...@@ -363,67 +363,67 @@ def scaled_dot_product_attention(queries, ...@@ -363,67 +363,67 @@ def scaled_dot_product_attention(queries,
num_heads=1, num_heads=1,
dropout_rate=0.): dropout_rate=0.):
""" """
The dot-product attention. This interface implements Multi-Head Attention using scaled dot product.
Attention mechanism can be seen as mapping a query and a set of key-value Attention mechanism can be seen as mapping a query and a set of key-value
pairs to an output. The output is computed as a weighted sum of the values, pairs to an output. Multi-Head Attention performs attention with multiple heads
where the weight assigned to each value is computed by a compatibility in parallel, and the inputs of attention are transformed by linear projection.
function (dot-product here) of the query with the corresponding key. The formula is as follows:
The dot-product attention can be implemented through (batch) matrix
multipication as follows:
.. math:: .. math::
Attention(Q, K, V)= softmax(QK^\mathrm{T})V MultiHead(Q, K, V ) & = Concat(head_1, ..., head_h)
where \\ head_i & = Attention(QW_i^Q , KW_i^K , VW_i^V )
Refer to `Attention Is All You Need Attention(Q, K, V) & = softmax (\\frac{QK^\\mathrm{T}}{\\sqrt{d_k}}) V
<https://arxiv.org/pdf/1706.03762.pdf>`_.
For more details, please refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_ .
Note that the implementation is adapted to batch, and all matrix multiplication
in :math:`Attention(Q, K, V)` is batched matrix multiplication. Refer to
:ref:`api_fluid_layers_matmul` .
Args: Args:
queries (Variable): The input variable which should be a 3-D Tensor. queries (Variable): A 3-D Tensor with shape :math:`[N, L_q, d_k \\times h]` ,
keys (Variable): The input variable which should be a 3-D Tensor. where :math:`N` stands for batch size, :math:`L_q` for the sequence length
values (Variable): The input variable which should be a 3-D Tensor. of query, :math:`d_k \\times h` for the feature size of query, :math:`h` for
num_heads (int): Head number to compute the scaled dot product head number. The data type should be float32 or float64.
attention. Default: 1. keys (Variable): A 3-D Tensor with shape :math:`[N, L_k, d_k \\times h]` ,
dropout_rate (float): The dropout rate to drop the attention weight. where :math:`N` stands for batch size, :math:`L_k` for the sequence length
Default: 0.0. of key, :math:`d_k \\times h` for the feature size of key, :math:`h` for head
number. The data type should be the same as ``queries`` .
values (Variable): A 3-D Tensor with shape :math:`[N, L_k, d_v \\times h]` ,
where :math:`N` stands for batch size, :math:`L_k` for the sequence length
of key, :math:`d_v \\times h` for the feature size of value, :math:`h` for head
number. The data type should be the same as ``queries`` .
num_heads (int, optional): Indicate the number of head. If the numher
is 1, linear projection would not be performed on inputs. Default: 1.
dropout_rate (float, optional): The rate to drop the attention weight.
Default: 0.0, which means no dropout.
Returns: Returns:
Variable: A 3-D Tensor computed by multi-head scaled dot product\ Variable: A 3-D Tensor with shape :math:`[N, L_q, d_v \\times h]` , \
attention. where :math:`N` stands for batch size, :math:`L_q` for the sequence \
length of query, :math:`d_v \\times h` for the feature size of value. \
It has the same data type as the inputs, representing the output of \
Multi-Head Attention.
Raises: Raises:
ValueError: If input queries, keys, values are not 3-D Tensors. ValueError: Inputs queries, keys and values should all be 3-D tensors.
ValueError: The hidden size of queries and keys should be the same.
NOTES: ValueError: The max sequence length in query batch and in key batch should be the same.
1. When num_heads > 1, three linear projections are learned respectively ValueError: The hidden size of keys must be divisible by the number of attention heads.
to map input queries, keys and values into queries', keys' and values'. ValueError: The hidden size of values must be divisible by the number of attention heads.
queries', keys' and values' have the same shapes with queries, keys
and values.
2. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle.fluid as fluid
queries = fluid.layers.data(name="queries", queries = fluid.data(name="queries", shape=[3, 5, 9], dtype="float32")
shape=[3, 5, 9], keys = fluid.data(name="keys", shape=[3, 6, 9], dtype="float32")
dtype="float32", values = fluid.data(name="values", shape=[3, 6, 10], dtype="float32")
append_batch_size=False)
queries.stop_gradient = False
keys = fluid.layers.data(name="keys",
shape=[3, 6, 9],
dtype="float32",
append_batch_size=False)
keys.stop_gradient = False
values = fluid.layers.data(name="values",
shape=[3, 6, 10],
dtype="float32",
append_batch_size=False)
values.stop_gradient = False
contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values) contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
contexts.shape # [3, 5, 10] contexts.shape # [3, 5, 10]
""" """
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册