From 4615c5172c82b44c60a6b6839f323559bf93448d Mon Sep 17 00:00:00 2001
From: luotao1
Date: Thu, 29 Sep 2016 10:06:27 +0800
Subject: [PATCH] beam search api and unittest in hierarchical rnn (#122)

---
 demo/seqToseq/seqToseq_net.py                 | 11 +--
 doc/algorithm/rnn/rnn.rst                     | 20 +++--
 .../tests/rnn_gen_test_model_dir/r1.test.nest | 16 ++++
 .../tests/sample_trainer_nest_rnn_gen.conf    | 73 +++++++++++++++++++
 .../trainer/tests/sample_trainer_rnn_gen.conf |  7 +-
 .../test_recurrent_machine_generation.cpp     | 62 +++++++++++-----
 .../trainer_config_helpers/evaluators.py      | 33 +++++++--
 .../paddle/trainer_config_helpers/layers.py   | 36 +--------
 8 files changed, 184 insertions(+), 74 deletions(-)
 create mode 100644 paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
 create mode 100644 paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf

diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index a9c0dd4af..2b0c3f346 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -171,12 +171,13 @@ def gru_encoder_decoder(data_conf,
     beam_gen = beam_search(name=decoder_group_name,
                            step=gru_decoder_with_attention,
                            input=group_inputs,
-                           id_input=data_layer(name="sent_id",
-                                               size=1),
-                           dict_file=trg_dict_path,
                            bos_id=0,
                            eos_id=1,
                            beam_size=beam_size,
-                           max_length=max_length,
-                           result_file=gen_trans_file)
+                           max_length=max_length)
+
+    seqtext_printer_evaluator(input=beam_gen,
+                              id_input=data_layer(name="sent_id", size=1),
+                              dict_file=trg_dict_path,
+                              result_file=gen_trans_file)
 
     outputs(beam_gen)

diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst
index 4753db450..0ab75a130 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/algorithm/rnn/rnn.rst
@@ -202,14 +202,17 @@ After training the model, we can use it to generate sequences. A common practice
 * use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the generated token at the last time step for the input at the current time step.
 * use :code:`beam_search` function. This function needs to set:
 
-  - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
-  - :code:`dict_file`: the dictionary file for converting word id to word.
   - :code:`bos_id`: the start token. Every sentence starts with the start token.
   - :code:`eos_id`: the end token. Every sentence ends with the end token.
   - :code:`beam_size`: the beam size used in beam search.
   - :code:`max_length`: the maximum length of the generated sentences.
-  - :code:`result_file`: the path of the generation result file.
+* use :code:`seqtext_printer_evaluator` to print text according to the index matrix and the dictionary. This function needs to set:
+
+  - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
+  - :code:`dict_file`: the dictionary file for converting word id to word.
+  - :code:`result_file`: the path of the generation result file.
+
 The code is listed below:
 
 .. code-block:: python
 
     beam_gen = beam_search(name=decoder_group_name,
                            step=gru_decoder_with_attention,
                            input=group_inputs,
-                           id_input=data_layer(name="sent_id",
-                                               size=1),
-                           dict_file=trg_dict_path,
                            bos_id=0, # Beginning token.
                            eos_id=1, # End of sentence token.
                           beam_size=beam_size,
-                          max_length=max_length,
-                          result_file=gen_trans_file)
+                          max_length=max_length)
+
+    seqtext_printer_evaluator(input=beam_gen,
+                              id_input=data_layer(name="sent_id", size=1),
+                              dict_file=trg_dict_path,
+                              result_file=gen_trans_file)
 
     outputs(beam_gen)

diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest b/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
new file mode 100644
index 000000000..02c7f142a
--- /dev/null
+++ b/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
@@ -0,0 +1,16 @@
+0 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+ 1 2 3 4
+
diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
new file mode 100644
index 000000000..613fd325e
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
@@ -0,0 +1,73 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=15, learning_rate=0)
+
+num_words = 5
+beam_flag = get_config_arg('beam_search', bool, False)
+
+sent_id = data_layer(name="sent_id", size=1)
+
+# This layer is not actually used; it only decides the batch size in generation.
+# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
+dummy_data = data_layer(name="dummy_data_input", size=2)
+
+def outer_step(dummy_data):
+
+    gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True),
+                  GeneratedInput(size=num_words,
+                                 embedding_name="wordvec",
+                                 embedding_size=num_words)]
+
+    def inner_step(dummy_memory, predict_word):
+
+        # simplified RNN for testing
+        with mixed_layer(size=num_words) as layer:
+            layer += full_matrix_projection(input=predict_word,
+                                            param_attr=ParamAttr(name="transtable"))
+
+        with mixed_layer(size=num_words, act=ExpActivation()) as out:
+            out += trans_full_matrix_projection(input=layer,
+                                                param_attr=ParamAttr(name="wordvec"))
+
+        return out
+
+    beam_gen = beam_search(name="rnn_gen",
+                           step=inner_step,
+                           input=gen_inputs,
+                           bos_id=0,
+                           eos_id=num_words-1,
+                           beam_size=2 if beam_flag else 1,
+                           num_results_per_sample=2 if beam_flag else 1,
+                           max_length=10)
+    return beam_gen
+
+beam_gen_concat = recurrent_group(name="rnn_gen_concat",
+                                  step=outer_step,
+                                  input=[SubsequenceInput(dummy_data)])
+
+seqtext_printer_evaluator(input=beam_gen_concat,
+                          id_input=sent_id,
+                          dict_file="./trainer/tests/test_gen_dict.txt",
+                          result_file="./trainer/tests/dump_text.test")
+#outputs(beam_gen_concat)
+# In this config, dummy_data_input has no real effect on beam_gen (dummy_memory is a
+# read-only memory and is not used by other layers of the step), so we specify the
+# Inputs and Outputs explicitly below. Note that "__beam_search_predict__" is the
+# default output name of beam_search.
+Inputs("sent_id","dummy_data_input")
+Outputs("__beam_search_predict__")

diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
index abb6e9b17..ec1c12cc8 100644
--- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
@@ -48,15 +48,16 @@ def step(dummy_memory, predict_word):
 
 beam_gen = beam_search(name="rnn_gen",
                        step=step,
                        input=gen_inputs,
-                       id_input=sent_id,
-                       dict_file="./trainer/tests/test_gen_dict.txt",
-                       result_file="./trainer/tests/dump_text.test",
                        bos_id=0,
                        eos_id=num_words-1,
                        beam_size=2 if beam_flag else 1,
                        num_results_per_sample=2 if beam_flag else 1,
                        max_length=10)
 
+seqtext_printer_evaluator(input=beam_gen,
+                          id_input=sent_id,
+                          dict_file="./trainer/tests/test_gen_dict.txt",
+                          result_file="./trainer/tests/dump_text.test")
 #outputs(beam_gen)
 # In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
 # is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs

diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index cf52c568e..fcee318d1 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include
 #include
 
@@ -24,6 +23,8 @@ using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
 static const string& CONFIG_FILE = "trainer/tests/sample_trainer_rnn_gen.conf";
+static const string& NEST_CONFIG_FILE =
+    "trainer/tests/sample_trainer_nest_rnn_gen.conf";
 static const string& OUTPUT_DIR = "trainer/tests/dump_text.test";
 static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1";  // NOLINT
 static string expectFile =  // NOLINT
@@ -50,32 +51,52 @@ void checkOutput(const string& expRetFile) {
   }
 }
 
-void prepareInArgs(vector<Argument>& inArgs,
-                   const size_t batchSize, bool useGpu) {
+void prepareInArgs(vector<Argument>& inArgs, const size_t batchSize,
+                   bool useGpu, bool hasSubseq) {
   inArgs.clear();
   // sentence id
   Argument sentId;
   sentId.value = nullptr;
-  IVector::resizeOrCreate(sentId.ids, batchSize, useGpu);
-  for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i);
+  if (hasSubseq) {
+    // As there is only one sequence, there is only one label.
+    IVector::resizeOrCreate(sentId.ids, 1, useGpu);
+    sentId.ids->setElement(0, 0);
+  } else {
+    // As there are batchSize words, there are batchSize labels.
+    IVector::resizeOrCreate(sentId.ids, batchSize, useGpu);
+    for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i);
+  }
   inArgs.emplace_back(sentId);
 
   // a dummy layer to decide batch size
   Argument dummyInput;
   dummyInput.value = Matrix::create(batchSize, 2, false, useGpu);
   dummyInput.value->randomizeUniform();
+  if (hasSubseq) {
+    // Generate one sequence with batchSize subsequences,
+    // each of which has only one word.
+    dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false);
+    int* buf = dummyInput.sequenceStartPositions->getMutableData(false);
+    dummyInput.subSequenceStartPositions =
+        ICpuGpuVector::create(batchSize + 1, false);
+    int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false);
+    buf[0] = 0;
+    buf[1] = batchSize;
+    for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i;
+  }
   inArgs.emplace_back(dummyInput);
 }
 
-void testGeneration(bool useGpu, const string& expRetFile) {
+void testGeneration(const string& configFile, bool useGpu, bool hasSubseq,
+                    const string& expRetFile) {
   FLAGS_use_gpu = useGpu;
-  auto config = std::make_shared<TrainerConfigHelper>(CONFIG_FILE);
+  auto config = std::make_shared<TrainerConfigHelper>(configFile);
   unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
   gradientMachine->loadParameters(modelDir);
   vector<Argument> inArgs(2);
   const size_t batchSize = 15;
-  prepareInArgs(inArgs, batchSize, useGpu);
+  prepareInArgs(inArgs, batchSize, useGpu, hasSubseq);
   vector<Argument> outArgs;
   unique_ptr<Evaluator> testEvaluator(gradientMachine->makeEvaluator());
   testEvaluator->start();
@@ -93,16 +114,21 @@ TEST(RecurrentGradientMachine, test_generation) {
 #else
   const auto useGpuConfs = {true, false};
 #endif
-  FLAGS_config_args = "beam_search=0";  // no beam search
-  string expectRetFileNoBeam = expectFile + ".nobeam";
-  for (auto useGpu : useGpuConfs) {
-    testGeneration(useGpu, expectRetFileNoBeam);
-  }
-  FLAGS_config_args = "beam_search=1";  // no beam search
-  string expectRetFileBeam = expectFile + ".beam";
-  for (auto useGpu : useGpuConfs) {
-    testGeneration(useGpu, expectRetFileBeam);
-  }
+  auto testGen = [&](const string& configFile, bool hasSubseq,
+                     const string& expRetFile, bool beam_search) {
+    FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
+    for (auto useGpu : useGpuConfs) {
+      testGeneration(configFile, useGpu, hasSubseq, expRetFile);
+    }
+  };
+  testGen(CONFIG_FILE, false, expectFile + ".nobeam", false);  // no beam search
+  testGen(CONFIG_FILE, false, expectFile + ".beam", true);     // beam search
+  // In hierarchical RNN, beam search and one-way search run only in the inner RNN;
+  // the outer RNN concatenates the generated inner results (taking the first
+  // candidate under beam search), so both settings yield the same outer results.
+  testGen(NEST_CONFIG_FILE, true, expectFile + ".nest",
+          false);  // no beam search
+  testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true);  // beam search
 }
 
 #endif

diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 7a00d0b7e..ded124a5c 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -559,6 +559,7 @@ def maxframe_printer_evaluator(
 def seqtext_printer_evaluator(
         input,
         result_file,
+        id_input=None,
         dict_file=None,
         delimited=None,
         name=None,
@@ -567,11 +568,10 @@
     Sequence text printer will print text according to index matrix and a
     dictionary. There can be multiple inputs to this layer:
 
-    1. If there is only one input, the input must be a matrix containing
+    1. If there is no id_input, the input must be a matrix containing
        the sequence of indices;
-    2. If there are more than one input, the first input should be ids,
-       and are interpreted as sample ids.
+    2. If there is id_input, it should be ids, and is interpreted as sample ids.
 
     The output format will be:
 
@@ -602,26 +602,43 @@
 
     ..  code-block:: python
 
-        eval = seqtext_printer_evaluator(input,
+        eval = seqtext_printer_evaluator(input=maxid_layer,
+                                         id_input=sample_id,
                                          dict_file=dict_file,
                                          result_file=result_file)
 
     :param input: Input Layer name.
     :type input: LayerOutput|list
-    :param dict_file: The input dictionary which contains a list of tokens.
-    :type dict_file: basestring
-    :param result_file: The file is to save the results.
+    :param result_file: Path of the file to store the generated results.
     :type result_file: basestring
+    :param id_input: Index of the input sequence, and the specified index will
+                     be printed in the generated results. This is an optional
+                     parameter.
+    :type id_input: LayerOutput
+    :param dict_file: Path of the dictionary. This is an optional parameter.
+                      Every line is a word in the dictionary with
+                      (line number - 1) as the word index.
+                      If this parameter is set to None, or to an empty string,
+                      only word indices are printed in the generated results.
+    :type dict_file: basestring
     :param delimited: Whether to use space to separate output tokens.
                 Default is True. No space is added if set to False.
     :type delimited: bool
     :param name: Evaluator name.
     :type name: None|basestring
+    :return: The seq_text_printer that prints the generated sequence to a file.
+    :rtype: evaluator
     """
     assert isinstance(result_file, basestring)
+    if id_input is None:
+        inputs = [input]
+    else:
+        inputs = [id_input, input]
+        input.parents.append(id_input)
+
     evaluator_base(name=name,
                    type="seq_text_printer",
-                   input=input,
+                   input=inputs,
                    dict_file=dict_file,
                    result_file=result_file,
                    delimited=delimited)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b28dd02b7..c355dc042 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2608,7 +2608,6 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
 
 @wrap_name_default()
 def beam_search(step, input, bos_id, eos_id, beam_size,
-                result_file, dict_file="", id_input=None,
                 max_length=500, name=None,
                 num_results_per_sample=None):
     """
@@ -2632,8 +2631,7 @@
                           input=[StaticInput(encoder_last)],
                           bos_id=0,
                           eos_id=1,
-                          beam_size=5,
-                          result_file="./generated_sequences.txt")
+                          beam_size=5)
 
     Please see the following demo for more details:
 
@@ -2671,24 +2669,12 @@
                       of the most promising next words. The greater the beam
                       size, the fewer candidate words are pruned.
     :type beam_size: int
-    :param result_file: Path of the file to store the generated results.
-    :type result_file: basestring
-    :param dict_file: Path of dictionary. This is an optional parameter.
-                      Every line is a word in the dictionary with
-                      (line number - 1) as the word index.
-                      If this parameter is set to None, or to an empty string,
-                      only word index are printed in the generated results.
-    :type dict_file: basestring
     :param num_results_per_sample: Number of the generated results per input
                                    sequence. This number must always be less
                                    than beam size.
     :type num_results_per_sample: int
-    :param id_input: Index of the input sequence, and the specified index will
-                     be prited in the gereated results. This an optional
-                     parameter.
-    :type id_input: LayerOutput
-    :return: The seq_text_printer that prints the generated sequence to a file.
-    :rtype: evaluator
+    :return: The generated word index.
+ :rtype: LayerOutput """ if num_results_per_sample is None: @@ -2704,7 +2690,6 @@ def beam_search(step, input, bos_id, eos_id, beam_size, real_input = [] for i, each_input in enumerate(input): - # print type(each_input) assert isinstance(each_input, StaticInput) or isinstance( each_input, BaseGeneratedInput) if isinstance(each_input, BaseGeneratedInput): @@ -2740,20 +2725,7 @@ def beam_search(step, input, bos_id, eos_id, beam_size, tmp = recurrent_group(step=__real_step__, input=real_input, reverse=False, name=name) - - if id_input is None: - inputs = [tmp.name] - else: - assert isinstance(id_input, LayerOutput) - inputs = [id_input.name, tmp.name] - tmp.parents.append(id_input) - - Evaluator(name='target_printer', - type='seq_text_printer', - dict_file=dict_file, - result_file=result_file, - inputs=inputs - ) + return tmp -- GitLab
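
Note on the resulting API: this patch decouples generation from printing. :code:`beam_search` now returns the generated word indices as an ordinary :code:`LayerOutput` (which is why it can also be nested inside a :code:`recurrent_group`, as in :code:`sample_trainer_nest_rnn_gen.conf` above), while writing readable text is delegated to :code:`seqtext_printer_evaluator`. The sketch below re-assembles the post-patch usage from the seqToseq demo diff; all names (:code:`decoder_group_name`, :code:`group_inputs`, :code:`trg_dict_path`, etc.) are defined earlier in that demo config, and the comments on :code:`bos_id`/:code:`eos_id` assume the demo's usual dictionary layout (start token at index 0, end token at index 1).

.. code-block:: python

    # Step 1: beam_search only generates. It returns the generated word
    # indices as a LayerOutput instead of printing them itself.
    beam_gen = beam_search(name=decoder_group_name,
                           step=gru_decoder_with_attention,
                           input=group_inputs,
                           bos_id=0,    # assumed start-token index
                           eos_id=1,    # assumed end-token index
                           beam_size=beam_size,
                           max_length=max_length)

    # Step 2: a separate evaluator maps the indices back to words via the
    # dictionary and tags each output sequence with its sample id.
    seqtext_printer_evaluator(input=beam_gen,
                              id_input=data_layer(name="sent_id", size=1),
                              dict_file=trg_dict_path,
                              result_file=gen_trans_file)

    outputs(beam_gen)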