From 8a8df06b7b3a9f4fb48cf95e611cf496ac165ad4 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Wed, 27 Jun 2018 11:09:17 +0800 Subject: [PATCH] Merge pull request #11238 from guoshengCS/fix-beam_search Fix and enhance beam_search_op and beam_searc_decode_op --- .../fluid/operators/beam_search_decode_op.cc | 84 ++++-- .../fluid/operators/beam_search_decode_op.h | 269 +++++++----------- .../operators/beam_search_decode_op_test.cc | 148 ++-------- paddle/fluid/operators/beam_search_op.cc | 137 ++++++--- paddle/fluid/operators/beam_search_op.h | 46 +-- paddle/fluid/operators/beam_search_op_test.cc | 15 +- .../operators/tensor_array_read_write_op.cc | 5 +- python/paddle/fluid/layers/nn.py | 194 ++++++++----- .../test_machine_translation.py | 22 +- .../tests/book/test_machine_translation.py | 22 +- .../unittests/test_beam_search_decode_op.py | 80 +++--- .../tests/unittests/test_beam_search_op.py | 30 +- 12 files changed, 545 insertions(+), 507 deletions(-) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index c3dd22119dd..10d678111f5 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_decode_op.h" +#include #include + +#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -22,8 +24,11 @@ namespace operators { struct BeamSearchDecodeFunctor { BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, - LoDTensor* id_tensor, LoDTensor* score_tensor) - : step_ids_origin_(step_ids), + LoDTensor* id_tensor, LoDTensor* score_tensor, + size_t beam_size, int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), step_scores_origin_(step_scores), id_tensor_(id_tensor), score_tensor_(score_tensor) { @@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_id : step_ids_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_id.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } out.set_lod(step_id.lod()); step_ids_.push_back(out); @@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_score : step_scores_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_score.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, + &out); + dev_ctx->Wait(); + } out.set_lod(step_score.lod()); step_scores_.push_back(out); @@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor { void operator()() const; bool tensor_on_gpu_; + size_t beam_size_; + int end_id_; const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); @@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::operator()() const { - BeamSearchDecoder beam_search_decoder; + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. If so, use the CPU copy instead if (tensor_on_gpu_) { - beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - score_tensor_); + beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, + score_tensor_); } else { - beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, - id_tensor_, score_tensor_); + beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); } } @@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase { "Level of LodTensor should be 2"); } + size_t beam_size = ctx.Attr("beam_size"); + int end_id = ctx.Attr("end_id"); + // prepare output LoDTensor* sentenceIds = ctx.Output("SentenceIds"); LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( framework::ToDataType(scores->at(0).type()), - BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, + beam_size, end_id)); } }; @@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Ids", "(LodTensorArray)" - "score of the candidate words in each step"); + "The LodTensorArray containing the selected ids of all steps"); AddInput("Scores", "(LodTensorArray)" - "score of the candidate words in each step"); - AddOutput("SentenceIds", - "(LodTensor)" - "All possible result sentences of word ids"); - AddOutput("SentenceScores", - "(LodTensor)" - "All possible result sentences of word scores"); + "The LodTensorArray containing the selected scores of all steps"); + AddOutput( + "SentenceIds", + "(LodTensor)" + "An LodTensor containing all generated id sequences for all source " + "sentences"); + AddOutput( + "SentenceScores", + "(LodTensor)" + "An LodTensor containing scores corresponding to Output(SentenceIds)"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); AddComment(R"DOC( -Pack the result of Beam search op into SentenceIds and SentenceScores. +Beam Search Decode Operator. This Operator constructs the full hypotheses for +each source sentence by walking back along the LoDTensorArray Input(ids) +whose lods can be used to restore the path in the beam search tree. + +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each +hypothesis has. )DOC"); } }; @@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_ids = block->FindRecursiveOrCreateVar(o); + sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_scores = block->FindRecursiveOrCreateVar(o); + sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 3c01f81c835..6aefc5446f1 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. -// The First is source level, the second is sentence level. -// source level describe how many candidate words for this source. -// sentence level describe these candidates belong to which prefix +// The first is source level, the second is sentence level. +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; const size_t kSentenceLevel = 1; -template -struct BeamNode { - BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} - - ~BeamNode() { - if (parent_) { - parent_->DropKid(this); - if (parent_->kids_.size() == 0UL) { - delete parent_; - } - } - VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; - } - - void AppendTo(BeamNode* parent) { - parent_ = parent; - parent->kids_.insert(this); - } - - void DropKid(BeamNode* kid) { kids_.erase(kid); } - - BeamNode* parent_ = nullptr; - std::unordered_set kids_; - int64_t word_id_; - T score_; -}; - -template -using BeamNodeVector = std::vector>>; - template struct Sentence { std::vector word_ids; @@ -72,24 +44,8 @@ using SentenceVector = std::vector>; template struct BeamSearchDecoder { - /** - * make a BeamNode and all it's related prefix BeanNode into a Sentence. - */ - Sentence MakeSentence(const BeamNode* node) const; - - /** - * Param: - * cur_ids: LoDTensor of One step for word ID - * cur_scores: LoDTensor of One Step for word score - * prefixes_list: prefixes for each source sentence. - * sentence_vector_list: result sentence_vector for each source sentence. - * Return: - * a new prefixes list for each source of current step - */ - std::vector> PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const; + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} /** * convert the result sentence_vector for each source sentence into two @@ -100,107 +56,30 @@ struct BeamSearchDecoder { * sentence_vector_list: sentence_vector for each source sentence. * id_tensor: result LoDTensor for sentences of id. * score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. */ void ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; + LoDTensor* score_tensor, bool reverse = true, + bool sort_by_score = true) const; /** - * Pack all steps of id/score LodTensor into sentence LoDTensor - * it's main logic is: - * ```python - * prefix - * result_sentence - * result_lod_tensor - * - * for (step in steps): - * prefix = PackTwoSteps(prefix, step, &result_sentence) - * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) - * ``` + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. */ - void PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; -}; - -template -Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { - Sentence sentence; - while (node != nullptr) { - sentence.word_ids.emplace_back(node->word_id_); - sentence.scores.emplace_back(node->score_); - node = node->parent_; - } - - std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); - std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); - - return sentence; -} - -template -std::vector> BeamSearchDecoder::PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const { - std::vector> result; + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; - for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; - ++src_idx) { - size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - - BeamNodeVector beam_nodes; - - // if prefixes size is 0, it means this is the first step. In this step, - // all candidate id is the start of candidate sentences. - if (prefixes_list->empty()) { - PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), - cur_ids.lod().at(kSentenceLevel).back(), - "in the first step"); - for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { - beam_nodes.push_back(std::unique_ptr>(new BeamNode( - cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); - } - } else { - BeamNodeVector& prefixes = prefixes_list->at(src_idx); - SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; - - PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), - "prefix and candidate set number should be the same"); - - auto candidate_offset = cur_ids.lod()[kSentenceLevel]; - for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { - std::unique_ptr>& prefix = prefixes[prefix_idx]; - size_t candidate_start = candidate_offset[src_start + prefix_idx]; - size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; - if (candidate_start == candidate_end) { - VLOG(3) << "this sentence has no more candidate, " - "add to result sentence and rm it from beam tree"; - sentence_vector.push_back(MakeSentence(prefix.get())); - prefix.reset(); - } else { - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; ++candidate_idx) { - auto* candidate = - new BeamNode(cur_ids.data()[candidate_idx], - cur_scores.data()[candidate_idx]); - candidate->AppendTo(prefix.get()); - beam_nodes.push_back(std::unique_ptr>(candidate)); - } - prefix.release(); - } - } - } - result.push_back(std::move(beam_nodes)); - } - return result; -} + size_t beam_size_; + int end_id_; +}; template void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const { + LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { size_t src_num = sentence_vector_list.size(); PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); @@ -211,11 +90,29 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } for (Sentence& sentence : sentence_vector_list[src_idx]) { - id_data.insert(id_data.end(), sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert(score_data.end(), sentence.scores.begin(), - sentence.scores.end()); + if (reverse) { + id_data.insert(id_data.end(), sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert(score_data.end(), sentence.scores.rbegin(), + sentence.scores.rend()); + } else { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + } + sentence_level_lod.push_back(sentence_level_lod.back() + sentence.word_ids.size()); } @@ -243,39 +140,75 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( } template -void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), "step_ids and step_scores should be the same"); const size_t step_num = step_ids.size(); const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list(src_num); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } - PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); - - // previous prefixes for each step, - // the init length is 0, means this is the first step. - std::vector> beamnode_vector_list(0); - std::vector> sentence_vector_list(src_num); - - // pack all steps for one batch first, then another batch - for (size_t step_id = 0; step_id < step_num; ++step_id) { - beamnode_vector_list = - PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), - &beamnode_vector_list, &sentence_vector_list); - } - // append last beam_node to result - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - for (auto& beam_node : beamnode_vector_list.at(src_idx)) { - sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); - beam_node.reset(); + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } } } ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, - score_tensor); + score_tensor, true, true); } } // namespace operators diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index 36f9594969c..88339e38d89 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD; using LoDTensor = paddle::framework::LoDTensor; using LoDTensorArray = paddle::framework::LoDTensorArray; -template -using BeamNode = paddle::operators::BeamNode; template using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; template using Sentence = paddle::operators::Sentence; template -using BeamNodeVector = paddle::operators::BeamNodeVector; -template using SentenceVector = paddle::operators::SentenceVector; namespace paddle { @@ -77,138 +73,50 @@ void GenerateExample(const std::vector& level_0, } // namespace test } // namespace paddle -TEST(BeamSearchDecodeOp, DeleteBeamNode) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* b2 = new BeamNode(2, 2); - auto* b3 = new BeamNode(3, 3); - - b1->AppendTo(root); - b2->AppendTo(root); - b3->AppendTo(b1); - - delete b3; - delete b2; -} - -TEST(BeamSearchDecodeOp, MakeSentence) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* end = new BeamNode(2, 2); - b1->AppendTo(root); - end->AppendTo(b1); - - BeamSearchDecoder helper; - Sentence sentence = helper.MakeSentence(end); - delete end; - - std::vector expect_ids = {0, 1, 2}; - ASSERT_EQ(sentence.word_ids, expect_ids); - - std::vector expect_scores = {0, 1, 2}; - ASSERT_EQ(sentence.scores, expect_scores); -} - -TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { - CPUPlace place; - - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample( - std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - BeamSearchDecoder helper; - beamnode_vector_list = helper.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - ASSERT_EQ(beamnode_vector_list.size(), 2UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); -} - -TEST(BeamSearchDecodeOp, PackTwoSteps) { - CPUPlace place; - - // first source has three prefix - BeamNodeVector source0_prefixes; - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(1, 1))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(0, 0))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(3, 3))); - - // second source has two prefix - BeamNodeVector source1_prefixes; - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(4, 4))); - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(5, 5))); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - beamnode_vector_list.push_back(std::move(source0_prefixes)); - beamnode_vector_list.push_back(std::move(source1_prefixes)); - - // generate data for one step - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample(std::vector{0, 3, 5}, - std::vector{0, 1, 1, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); - - BeamSearchDecoder helper1; - beamnode_vector_list = helper1.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - - ASSERT_EQ(sentence_vector_list[0].size(), 1UL); - ASSERT_EQ(sentence_vector_list[1].size(), 0UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); -} - -TEST(BeamSearchDecodeOp, PackAllSteps) { +TEST(BeamSearchDecodeOp, Backtrace) { CPUPlace place; - // we will constuct a sample data with 3 steps and 2 source sentences + // Construct sample data with 5 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 LoDTensorArray ids; LoDTensorArray scores; paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + std::vector{0, 1, 2}, std::vector{0, 1, 2}, + std::vector{0, 0}, &ids, &scores); // start with start_id + paddle::test::GenerateExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, - std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); - paddle::test::GenerateExample(std::vector{0, 3, 6}, - std::vector{0, 0, 1, 2, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, + 2}, // the branchs of the first source sentence + // are pruned since finished + std::vector{5, 1}, + &ids, &scores); - ASSERT_EQ(ids.size(), 3UL); - ASSERT_EQ(scores.size(), 3UL); + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); - BeamSearchDecoder helper; + BeamSearchDecoder helper(2, 1); // beam_size = 2, end_id = 1 LoDTensor id_tensor; LoDTensor score_tensor; - helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + helper.Backtrace(ids, scores, &id_tensor, &score_tensor); LoD lod = id_tensor.lod(); - std::vector expect_source_lod = {0, 4, 8}; + std::vector expect_source_lod = {0, 2, 4}; EXPECT_EQ(lod[0], expect_source_lod); - std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; EXPECT_EQ(lod[1], expect_sentence_lod); - // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 - std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, - 4, 3, 2, 4, 4, 3, 6, 5, 4}; + std::vector expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4, + 5, 3, 5, 0, 4, 5, 3, 1}; ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); for (size_t i = 0; i < expect_data.size(); ++i) { ASSERT_EQ(id_tensor.data()[i], diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index df0b50881f4..62771d09f11 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" - #include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto &high_level = abs_lod[lod_level_]; - auto items = SelectTopBeamSizeItems(); + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = ToMap(items, high_level.back()); VLOG(3) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { @@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, VLOG(3) << ItemToString(item); } } - PruneEndidCandidates(pre_ids, &selected_items); + + PruneEndBeams(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( std::begin(selected_items), std::end(selected_items), 0, @@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, size_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); - sort(items.begin(), items.end(), [](const Item &a, const Item &b) { - if (a.offset < b.offset) { - return true; - } - return a.id < b.id; - }); for (auto &item : items) { ids_data[low_offset] = item.id; scores_data[low_offset] = item.score; @@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, selected_scores->set_lod(lod); } -int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, - std::vector> *items) { +void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, + std::vector> *items) { auto *pre_ids_data = pre_ids.data(); - - int res = 0; - for (size_t offset = 0; offset < items->size(); offset++) { - auto prefix_id = pre_ids_data[offset]; - if (prefix_id == end_id_) { - items->at(offset).clear(); - } else { - res++; + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id_) || + pre_ids_data[offset] != end_id_) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) + items->at(offset).clear(); } } - - return res; } std::vector> BeamSearch::ToMap( @@ -115,19 +121,17 @@ std::vector> BeamSearch::ToMap( return result; } -std::vector> -BeamSearch::SelectTopBeamSizeItems() { +std::vector> BeamSearch::SelectTopBeamSizeItems( + const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores) { std::vector> result; std::vector items; // for each source sentence, select the top beam_size items across all // candidate sets. - while (NextItemSet(&items)) { - std::nth_element(std::begin(items), std::begin(items) + beam_size_, - std::end(items), [](const Item &a, const Item &b) { - // TODO(superjom) make score's comparation customizable. - // partial sort in descending order - return a.score > b.score; - }); + while (NextItemSet(pre_ids, pre_scores, &items)) { + std::nth_element( + std::begin(items), std::begin(items) + beam_size_, std::end(items), + [](const Item &a, const Item &b) { return a.score > b.score; }); // prune the top beam_size items. if (items.size() > beam_size_) { items.resize(beam_size_); @@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() { } // the candidates of a source -bool BeamSearch::NextItemSet(std::vector *items) { +bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, + std::vector *items) { if (sent_offset_ >= ids_->NumElements(lod_level_)) { return false; } @@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector *items) { instance_dim *= ids.dims()[i]; } + auto *pre_ids_data = pre_ids.data(); + auto *pre_scores_data = pre_scores.data(); items->clear(); items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id_) { + // Allocate all probability mass to eos_id for finished branchs and the + // other candidate ids can be ignored. + items->emplace_back(offset, end_id_, pre_score); + } else { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } } } @@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // inputs and outputs stored in proto - AddInput("pre_ids", "ids in previous step"); - AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("pre_ids", + "(LoDTensor) The LoDTensor containing the selected ids at the " + "previous step. It should be a tensor with shape (batch_size, 1) " + "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " + "thefirst step."); + AddInput("pre_scores", + "(LoDTensor) The LoDTensor containing the accumulated " + "scores corresponding to the selected ids at the previous step."); + AddInput("ids", + "(LoDTensor) The LoDTensor containing the candidates ids. Its " + "shape should be (batch_size * beam_size, K), where K supposed to " + "be beam_size."); AddInput("scores", - "a LoDTensor that has the same shape and LoD with `ids`"); + "(LoDTensor) The LodTensor containing the accumulated scores " + "corresponding to Input(ids) and its shape is the same as the " + "shape of Input(ids)."); AddOutput("selected_ids", - "a LoDTensor that stores the IDs selected by beam search"); - AddOutput( - "selected_scores", - "a LoDTensor that has the same shape and LoD with `selected_ids`"); + "A LodTensor that stores the IDs selected by beam search."); + AddOutput("selected_scores", + "A LoDTensor containing the accumulated scores corresponding to " + "Output(selected_ids)."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); @@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("end_id", "the token id which indicates the end of a sequence"); - AddComment( - "This is a beam search operator that help to generate sequences."); + AddComment(R"DOC( +This operator does the search in beams for one time step. +Specifically, it selects the top-K candidate word ids of current step from +Input(ids) according to their Input(scores) for all source sentences, +where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results +from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores) +are the output of beam_search at previous step, they are needed for special use +to handle ended candidate translations. The paths linking prefixes and selected +candidates are organized and reserved in lod. + +Note that the Input(scores) passed in should be accumulated scores, and +length penalty should be done with extra operators before calculating the +accumulated scores if needed, also suggest finding top-K before it and +using the top-K candidates following. +)DOC"); } }; @@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("selected_ids")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_ids = block->FindRecursiveOrCreateVar(o); + selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto &o : op_desc.Output("selected_scores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_scores = block->FindRecursiveOrCreateVar(o); + selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 46bc4f6f936..b5e2ed05924 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -132,6 +132,7 @@ class BeamSearch { * that means no candidates is provided, and the task will stop running. */ void operator()(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); /* @@ -153,14 +154,16 @@ class BeamSearch { protected: /* - * Delete all the records that follows the end token. + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. */ - int PruneEndidCandidates(const framework::LoDTensor& pre_ids, - std::vector>* items); + void PruneEndBeams(const framework::LoDTensor& pre_ids, + std::vector>* items); /* * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance + * NOTE low performance. */ std::vector> ToMap( const std::vector>& inputs, size_t element_num); @@ -168,12 +171,16 @@ class BeamSearch { /* * For each source, select top beam_size records. */ - std::vector> SelectTopBeamSizeItems(); + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores); /* * Get the items of next source sequence, return false if no remaining items. */ - bool NextItemSet(std::vector* items); + bool NextItemSet(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, + std::vector* items); private: size_t beam_size_; @@ -192,24 +199,25 @@ template class BeamSearchOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* ids_var = context.Input("ids"); - auto* scores_var = context.Input("scores"); - auto* pre_ids_var = context.Input("pre_ids"); - PADDLE_ENFORCE_NOT_NULL(ids_var); - PADDLE_ENFORCE_NOT_NULL(scores_var); - PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + auto* ids = context.Input("ids"); + auto* scores = context.Input("scores"); + auto* pre_ids = context.Input("pre_ids"); + auto* pre_scores = context.Input("pre_scores"); + PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); + PADDLE_ENFORCE_NOT_NULL(pre_ids); + PADDLE_ENFORCE_NOT_NULL(pre_scores); size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); - auto selected_ids_var = - context.Output("selected_ids"); - auto selected_scores_var = + BeamSearch alg(*ids, *scores, level, beam_size, end_id); + auto selected_ids = context.Output("selected_ids"); + auto selected_scores = context.Output("selected_scores"); - PADDLE_ENFORCE_NOT_NULL(selected_ids_var); - PADDLE_ENFORCE_NOT_NULL(selected_scores_var); - alg(*pre_ids_var, selected_ids_var, selected_scores_var); + PADDLE_ENFORCE_NOT_NULL(selected_ids); + PADDLE_ENFORCE_NOT_NULL(selected_scores); + alg(*pre_ids, *pre_scores, selected_ids, selected_scores); } }; } // namespace operators diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index ec666359aa2..c4f4b478fbf 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,7 +30,7 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 1, 4}); + vector level0({0, 2, 4}); vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); @@ -64,17 +64,22 @@ TEST(beam_search_op, run) { for (int i = 0; i < 4; i++) { pre_ids.mutable_data(place)[i] = i + 1; } + LoDTensor pre_scores; + pre_scores.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); + } - BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); LoDTensor sids, sscores; - beamsearch(pre_ids, &sids, &sscores); + beamsearch(pre_ids, pre_scores, &sids, &sscores); LOG(INFO) << "score: " << sscores << endl; ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids({2, 4, 3, 8}); - vector tscores({0.3, 0.5, 0.9, 0.7}); + vector tids({4, 2, 3, 8}); + vector tscores({0.5, 0.6, 0.9, 0.7}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc index c703d11eecc..a2d44284e9d 100644 --- a/paddle/fluid/operators/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } + auto *out_tensor = &out->at(offset); + out_tensor->set_lod(x_tensor.lod()); if (x_tensor.memory_size() > 0) { - auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); TensorCopy(x_tensor, place, dev_ctx, out_tensor); - out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2979ff3057a..9e8d5f6cd35 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2223,56 +2223,6 @@ def layer_norm(input, return helper.append_activation(layer_norm_out) -def beam_search_decode(ids, scores, name=None): - """ - Beam Search Decode - - This layers is to pack the output of beam search layer into sentences and - associated scores. It is usually called after the beam search layer. - Typically, the output of beam search layer is a tensor of selected ids, with - a tensor of the score of each id. Beam search layer's output ids, however, - are generated directly during the tree search, and they are stacked by each - level of the search tree. Thus we need to reorganize them into sentences, - based on the score of each id. This layer takes the output of beam search - layer as input and repack them into sentences. - - Args: - ids (Variable): The selected ids, output of beam search layer. - scores (Variable): The associated scores of the ids, out put of beam - search layer. - name (str): The name of this layer. It is optional. - - Returns: - tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores. - sentence_ids is a tensor with shape [size, length], where size is the - beam size of beam search, and length is the length of each sentence. - Note that the length of sentences may vary. - sentence_scores is a tensor with the same shape as sentence_ids. - - Examples: - .. code-block:: python - - ids, scores = fluid.layers.beam_search( - pre_ids, ids, scores, beam_size, end_id) - sentence_ids, sentence_scores = fluid.layers.beam_search_decode( - ids, scores) - """ - helper = LayerHelper('beam_search_decode', **locals()) - sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) - sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) - - helper.append_op( - type="beam_search_decode", - inputs={"Ids": ids, - "Scores": scores}, - outputs={ - "SentenceIds": sentence_ids, - "SentenceScores": sentence_scores - }) - - return sentence_ids, sentence_scores - - def conv2d_transpose(input, num_filters, output_size=None, @@ -2676,38 +2626,89 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp -def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): - ''' - **beam search** - - This function implements the beam search algorithm. - - Beam search is a classical algorithm for selecting candidate words - in a machine translation task. +def beam_search(pre_ids, + pre_scores, + ids, + scores, + beam_size, + end_id, + level=0, + name=None): + """ + Beam search is a classical algorithm for selecting candidate words in a + machine translation task. Refer to `Beam search `_ for more details. + + This layer does the search in beams for one time step. Specifically, it + selects the top-K candidate word ids of current step from :attr:`ids` + according to their :attr:`scores` for all source sentences, where K is + :attr:`beam_size` and :attr:`ids, scores` are predicted results from the + computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are + the output of beam_search at previous step, they are needed for special use + to handle ended candidate translations. + + Note that the :attr:`scores` passed in should be accumulated scores, and + length penalty should be done with extra operators before calculating the + accumulated scores if needed, also suggest finding top-K before it and + using the top-K candidates following. + + Please see the following demo for a fully beam search usage example: + + fluid/tests/book/test_machine_translation.py Args: - pre_ids (Variable): ids in previous step. - ids (Variable): a LoDTensor of shape of [None,k] - scores (Variable): a LoDTensor that has the same shape and LoD with `ids` - beam_size (int): beam size for beam search - end_id (int): the token id which indicates the end of a sequence - level (int): the level of LoDTensor + pre_ids(Variable): The LodTensor variable which is the output of + beam_search at previous step. It should be a LodTensor with shape + :math:`(batch_size, 1)` and lod + :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the + first step. + pre_scores(Variable): The LodTensor variable which is the output of + beam_search at previous step. + ids(Variable): The LodTensor variable containing the candidates ids. + Its shape should be :math:`(batch_size \\times beam_size, K)`, + where :math:`K` supposed to be :attr:`beam_size`. + scores(Variable): The LodTensor variable containing the accumulated + scores corresponding to :attr:`ids` and its shape is the same as + the shape of :attr:`ids`. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + level(int, default 0): It can be ignored and mustn't change currently. + It means the source level of lod, which is explained as following. + The lod level of :attr:`ids` should be 2. The first level is source + level which describes how many prefixes (branchs) for each source + sentece (beam), and the second level is sentence level which + describes how these candidates belong to the prefix. The paths + linking prefixes and selected candidates are organized and reserved + in lod. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: - tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores` + Variable: The LodTensor pair containing the selected ids and the \ + corresponding scores. Examples: .. code-block:: python - # current_score is a Tensor of shape (num_batch_size, embed_size), which - # consists score of each candidate word. - topk_scores, topk_indices = pd.topk(current_score, k=50) - selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) - ''' + # Suppose `probs` contains predicted results from the computation + # cell and `pre_ids` and `pre_scores` is the output of beam_search + # at previous step. + topk_scores, topk_indices = layers.topk(probs, k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores)), + y=layers.reshape( + pre_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=end_id) + """ helper = LayerHelper('beam_search', **locals()) score_type = scores.dtype id_type = ids.dtype @@ -2719,6 +2720,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): type='beam_search', inputs={ 'pre_ids': pre_ids, + 'pre_scores': pre_scores, 'ids': ids, 'scores': scores, }, @@ -2736,6 +2738,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): return selected_ids, selected_scores +def beam_search_decode(ids, scores, beam_size, end_id, name=None): + """ + Beam Search Decode Layer. This layer constructs the full hypotheses for + each source sentence by walking back along the LoDTensorArray :attr:`ids` + whose lods can be used to restore the path in the beam search tree. + Please see the following demo for a fully beam search usage example: + fluid/tests/book/test_machine_translation.py + + Args: + ids(Variable): The LodTensorArray variable containing the selected ids + of all steps. + scores(Variable): The LodTensorArray variable containing the selected + scores of all steps. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the generated id sequences \ + and the corresponding scores. The shapes and lods of the two \ + LodTensor are same. The lod level is 2 and the two levels \ + separately indicate how many hypotheses each source sentence has \ + and how many ids each hypothesis has. + + Examples: + .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving + # the selected ids and scores of all steps + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=5, end_id=0) + """ + helper = LayerHelper('beam_search_decode', **locals()) + sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) + sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) + + helper.append_op( + type="beam_search_decode", + inputs={"Ids": ids, + "Scores": scores}, + outputs={ + "SentenceIds": sentence_ids, + "SentenceScores": sentence_scores + }, + attrs={"beam_size": beam_size, + "end_id": end_id}) + + return sentence_ids, sentence_scores + + def lstm_unit(x_t, hidden_t_prev, cell_t_prev, diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index c4b37df3a09..f690a0d2337 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -127,9 +127,19 @@ def decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -138,10 +148,14 @@ def decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 372d6ec8223..5238203317a 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=50) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -137,10 +147,14 @@ def decoder_decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 4e1687477c6..db5771f7b0a 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -20,44 +20,58 @@ from paddle.fluid.op import Operator class TestBeamSearchDecodeOp(unittest.TestCase): + """unittest of beam_search_decode_op""" + def setUp(self): self.scope = core.Scope() self.place = core.CPUPlace() def append_lod_tensor(self, tensor_array, lod, data): lod_tensor = core.LoDTensor() - lod_tensor.set_recursive_sequence_lengths(lod) + lod_tensor.set_lod(lod) lod_tensor.set(data, self.place) tensor_array.append(lod_tensor) def test_get_set(self): ids = self.scope.var("ids").get_lod_tensor_array() - self.append_lod_tensor( - ids, [[3, 3], [1, 1, 1, 1, 1, 1]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="int64")) - self.append_lod_tensor( - ids, [[3, 3], [1, 0, 2, 2, 0, 1]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="int64")) - self.append_lod_tensor( - ids, [[3, 3], [0, 1, 1, 1, 1, 1]], - np.array( - [0, 1, 2, 3, 4], dtype="int64")) - scores = self.scope.var("scores").get_lod_tensor_array() - self.append_lod_tensor( - scores, [[3, 3], [1, 1, 1, 1, 1, 1]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="float64")) - self.append_lod_tensor( - scores, [[3, 3], [1, 0, 2, 2, 0, 1]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="float64")) - self.append_lod_tensor( - scores, [[3, 3], [0, 1, 1, 1, 1, 1]], - np.array( - [0, 1, 2, 3, 4], dtype="float64")) + # Construct sample data with 5 steps and 2 source sentences + # beam_size = 2, end_id = 1 + # start with start_id + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 1, 2]], np.array( + [0, 0], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 2, 4]], + np.array( + [2, 3, 4, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 2, 2, 4, 4]], + np.array( + [3, 1, 5, 4], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 1, 2, 3, 4]], + np.array( + [1, 1, 3, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 0, 0, 2, 2]], + np.array( + [5, 1], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor() @@ -69,18 +83,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase): Scores="scores", # outputs SentenceIds="sentence_ids", - SentenceScores="sentence_scores") + SentenceScores="sentence_scores", + beam_size=2, + end_id=1, ) beam_search_decode_op.run(self.scope, self.place) - expected_lod = [[4, 4], [1, 2, 3, 3, 1, 3, 3, 3]] - self.assertEqual(sentence_ids.recursive_sequence_lengths(), - expected_lod) - self.assertEqual(sentence_scores.recursive_sequence_lengths(), - expected_lod) + expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]] + self.assertEqual(sentence_ids.lod(), expected_lod) + self.assertEqual(sentence_scores.lod(), expected_lod) expected_data = np.array( - [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64") + [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) self.assertTrue( np.array_equal(np.array(sentence_scores), expected_data)) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index 5a14178c278..167451edd8c 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): + """unittest of beam_search_op""" + def setUp(self): self.scope = core.Scope() self._create_ids() + self._create_pre_scores() self._create_scores() self._create_pre_ids() self.scope.var('selected_ids') @@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase): def test_run(self): op = Operator( 'beam_search', - pre_ids="pre_ids", + pre_ids='pre_ids', + pre_scores='pre_scores', ids='ids', scores='scores', selected_ids='selected_ids', @@ -47,19 +51,31 @@ class BeamSearchOpTester(unittest.TestCase): end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() - print 'selected_ids', np.array(selected_ids) - print 'lod', selected_ids.recursive_sequence_lengths() + selected_scores = self.scope.find_var("selected_scores").get_tensor() + self.assertTrue( + np.allclose( + np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) + self.assertTrue( + np.allclose( + np.array(selected_scores), + np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) + self.assertEqual(selected_ids.lod(), + [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') - tensor = create_tensor(self.scope, "pre_ids", np_data) + tensor = create_tensor(self.scope, 'pre_ids', np_data) + + def _create_pre_scores(self): + np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32') + tensor = create_tensor(self.scope, 'pre_scores', np_data) def _create_ids(self): - self.lod = [[1, 3], [1, 1, 1, 1]] + self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]] np_data = np.array( [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) - tensor.set_recursive_sequence_lengths(self.lod) + tensor.set_lod(self.lod) def _create_scores(self): np_data = np.array( @@ -71,7 +87,7 @@ class BeamSearchOpTester(unittest.TestCase): ], dtype='float32') tensor = create_tensor(self.scope, "scores", np_data) - tensor.set_recursive_sequence_lengths(self.lod) + tensor.set_lod(self.lod) if __name__ == '__main__': -- GitLab