diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index 8b613de71ade4d05b36bd3a254ca14877d1bc876..08183e1f75d339ff70c0fb5130b94508d8b86344 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -158,12 +158,15 @@ def gru_encoder_decoder(data_conf,
                     is_seq=True),
         StaticInput(input=encoded_proj, is_seq=True),
     ]
-    # In generation, decoder predicts a next target word based on
+    # In generation, the decoder predicts the next target word based on
     # the encoded source sequence and the last generated target word.
+
     # The encoded source sequence (encoder's output) must be specified by
-    # StaticInput which is a read-only memory.
-    # Here, GeneratedInputs automatically fetchs the last generated word,
-    # which is initialized by a start mark, such as <s>.
+    # StaticInput, which is a read-only memory.
+    # The embedding of the last generated word is automatically fetched by
+    # GeneratedInput, which is initialized by a start mark, such as <s>,
+    # and must be included in generation.
+
     trg_embedding = GeneratedInput(
         size=target_dict_dim,
         embedding_name='_target_language_embedding',
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 1eaf26fdbf5eaeec85ddca7b0364f3ce3e3ec9f8..6e7964c12c171faab16524d9391b0dff0362a774 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2198,7 +2198,8 @@ def recurrent_group(step, input, reverse=False, name=None):
     :type input: LayerOutput|StaticInput|SubsequenceInput|list|tuple
-    :param reverse: Reverse is true, rnn will process sequence reversely.
+    :param reverse: If reverse is set to True, the recurrent unit will process
+                    the input sequence in reverse order.
     :type reverse: bool
     :return: Layer output object
     :rtype: LayerOutput
@@ -2372,6 +2373,84 @@ def beam_search(step, input, bos_id, eos_id, beam_size, result_file,
                 dict_file="", id_input=None, max_length=500, name=None,
                 num_results_per_sample=None):
+    """
+    Beam search is a heuristic search algorithm used in sequence generation.
+    It explores a graph by expanding the most promising nodes in a limited set
+    to maintain tractability.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        def rnn_step(input):
+            last_time_step_output = memory(name='rnn', size=512)
+            with mixed_layer(size=512) as simple_rnn:
+                simple_rnn += full_matrix_projection(input)
+                simple_rnn += last_time_step_output
+            return simple_rnn
+
+        beam_gen = beam_search(name="decoder",
+                               step=rnn_step,
+                               input=[StaticInput("encoder_last")],
+                               bos_id=0,
+                               eos_id=1,
+                               beam_size=5,
+                               result_file="./generated_sequences.txt")
+
+    Please see the following demo for more details:
+
+    - machine translation : demo/seqToseq/translation/gen.conf \
+                            demo/seqToseq/seqToseq_net.py
+
+    :param name: Name of the recurrent unit that generates sequences.
+    :type name: basestring
+    :param step: A callable function that defines the calculation in a time
+                 step. It is applied to sequences of arbitrary length by
+                 sharing the same set of weights.
+
+                 You can refer to the first parameter of recurrent_group, or
+                 demo/seqToseq/seqToseq_net.py for more details.
+    :type step: callable
+    :param input: Input data for the recurrent unit.
+    :type input: StaticInput|GeneratedInput
+    :param bos_id: Index of the start symbol in the dictionary. The start
+                   symbol is a special token for NLP tasks, which indicates
+                   the beginning of a sequence. In the generation task, the
+                   start symbol is essential, since it is used to initialize
+                   the RNN internal state.
+    :type bos_id: int
+    :param eos_id: Index of the end symbol in the dictionary. The end symbol
+                   is a special token for NLP tasks, which indicates the end
+                   of a sequence. The generation process stops once the end
+                   symbol is generated, or a pre-defined maximum number of
+                   iterations is exceeded.
+    :type eos_id: int
+    :param beam_size: Beam search for sequence generation is an iterative
+                      search algorithm. To maintain tractability, every
+                      iteration only stores a predetermined number, called
+                      the beam_size, of the most promising next words. The
+                      greater the beam size, the fewer candidate words are
+                      pruned.
+    :type beam_size: int
+    :param result_file: Path of the file to store the generated results.
+    :type result_file: basestring
+    :param dict_file: Path of the dictionary. This is an optional parameter.
+                      Every line is a word in the dictionary with
+                      (line number - 1) as the word index.
+                      If this parameter is set to None, or to an empty string,
+                      only the word indices are printed in the generated
+                      results.
+    :type dict_file: basestring
+    :param num_results_per_sample: Number of the generated results per input
+                                   sequence. This number must not be greater
+                                   than beam_size.
+    :type num_results_per_sample: int
+    :param id_input: Index of the input sequence; the specified index will
+                     be printed in the generated results. This is an optional
+                     parameter.
+    :type id_input: LayerOutput
+    :return: The seq_text_printer that prints the generated sequence to a file.
+    :rtype: evaluator
+    """
+
     if num_results_per_sample is None:
         num_results_per_sample = beam_size
     if num_results_per_sample > beam_size:
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index b162304b91861acf626ee48a26ca045acf2d283b..1d0a1d52a9f943006713135c19af628beb43d74c 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -787,7 +787,7 @@ def simple_attention(encoded_sequence,
                      name=None):
     """
     Calculate and then return a context vector by attention machanism.
-    Size of the context vector equals to size of encoded_sequence.
+    The size of the context vector equals the size of the encoded_sequence.
 
     .. math::
 
@@ -795,7 +795,7 @@ def simple_attention(encoded_sequence,
 
         e_{i,j} & = a(s_{i-1}, h_{j})
 
-        a_{i,j} & = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
+        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_{x}}{exp(e_{i,k})}}
 
         c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
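
For orientation, the pieces documented above fit together as in the seqToseq demo: the encoder's outputs enter the decoder group as read-only StaticInputs, GeneratedInput feeds back the embedding of the last generated word, and beam_search drives the generation loop. The following is a minimal sketch assembled from the hunks above, not a complete config; `encoded_vector`, `encoded_proj`, `gru_decoder_with_attention`, `target_dict_dim`, and `word_vector_dim` are placeholders for the encoder, step function, and dimensions defined elsewhere in demo/seqToseq/seqToseq_net.py.

.. code-block:: python

    # Sketch only: encoder outputs, the decoder step function, and the
    # dimension variables are placeholders defined elsewhere in the demo.
    group_inputs = [
        StaticInput(input=encoded_vector, is_seq=True),  # read-only memory
        StaticInput(input=encoded_proj, is_seq=True),
    ]

    # GeneratedInput feeds back the embedding of the last generated word;
    # the first step is initialized with the start mark <s> (bos_id).
    trg_embedding = GeneratedInput(
        size=target_dict_dim,
        embedding_name='_target_language_embedding',
        embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)

    beam_gen = beam_search(name='decoder',
                           step=gru_decoder_with_attention,
                           input=group_inputs,
                           bos_id=0,    # index of <s> in the target dictionary
                           eos_id=1,    # generation stops at the end symbol
                           beam_size=5,
                           max_length=250,  # illustrative iteration cap
                           result_file='./translation_results.txt')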