From 16d54f7f23cac51988de6937cfdf3d3f66991afa Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 30 Jan 2019 11:24:45 +0800 Subject: [PATCH] Return parent_idx in beam_search op (#15520) * Refine beam_search_op to output an extra parent_idx tensor. test=develop * Fix the unittest test_beam_search_op. test=develop * Fix the merging mistake. test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/beam_search_op.cc | 3 + paddle/fluid/operators/beam_search_op.h | 6 +- paddle/fluid/operators/gather_op.cu | 5 +- paddle/fluid/operators/gather_op.h | 4 +- paddle/fluid/operators/math/beam_search.cc | 8 ++- paddle/fluid/operators/math/beam_search.cu | 68 ++++++++++--------- paddle/fluid/operators/math/beam_search.h | 14 ++-- .../fluid/operators/math/beam_search_test.cc | 3 +- python/paddle/fluid/layers/nn.py | 25 +++++-- .../tests/unittests/test_beam_search_op.py | 5 ++ 11 files changed, 88 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 349460ad98e..fe8d6dd4259 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index e78ecc1a123..e93cd8615e0 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -51,6 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("selected_scores", "A LoDTensor containing the accumulated scores corresponding to " "Output(selected_ids)."); + AddOutput( + "parent_idx", + "A Tensor preserving the selected_ids' parent indice in pre_ids."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 1b939e742de..f808020cc76 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -41,13 +41,15 @@ class BeamSearchOpKernel : public framework::OpKernel { auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); + auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); + PADDLE_ENFORCE_NOT_NULL(parent_idx); math::BeamSearchFunctor alg; alg(context.template device_context(), pre_ids, pre_scores, - ids, scores, selected_ids, selected_scores, level, beam_size, end_id, - is_accumulated); + ids, scores, selected_ids, selected_scores, parent_idx, level, + beam_size, end_id, is_accumulated); } }; diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 9f4aef08cd5..427ac61858e 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel { auto *output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); - + if (x->numel() == 0) return; GPUGather(ctx.device_context(), *x, *index, output); } }; @@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { auto *Index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); - auto *x = ctx.Input("X"); dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); auto &place = *ctx.template device_context() .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); - + if (dO->numel() == 0) return; GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 2dd726bebb1..2e18298cf8e 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel { auto *output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); - + if (x->numel() == 0) return; CPUGather(ctx.device_context(), *x, *index, output); } }; @@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel { auto &place = *ctx.template device_context() .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); - + if (dO->numel() == 0) return; ScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index fb7119273a7..69971ef7423 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -29,8 +29,9 @@ class BeamSearchFunctor { const framework::LoDTensor *ids, const framework::LoDTensor *scores, framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores, size_t level, - size_t beam_size, int end_id, bool is_accumulated) { + framework::LoDTensor *selected_scores, + framework::Tensor *parent_idx, size_t level, size_t beam_size, + int end_id, bool is_accumulated) { auto abs_lod = framework::ToAbsOffset(scores->lod()); auto &high_level = abs_lod[level]; @@ -57,11 +58,13 @@ class BeamSearchFunctor { std::vector({static_cast(num_instances), 1})); selected_ids->Resize(dims); selected_scores->Resize(dims); + parent_idx->Resize({static_cast(num_instances)}); auto *selected_ids_data = selected_ids->mutable_data(platform::CPUPlace()); auto *selected_scores_data = selected_scores->mutable_data(platform::CPUPlace()); + auto *parent_idx_data = parent_idx->mutable_data(platform::CPUPlace()); // fill in data std::vector low_level; @@ -69,6 +72,7 @@ class BeamSearchFunctor { for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { + parent_idx_data[low_offset] = static_cast(low_level.size() - 1); selected_ids_data[low_offset] = item.id; selected_scores_data[low_offset] = item.score; low_offset++; diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index d94e3023ce5..61d021ef627 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -157,10 +157,10 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, } __device__ __forceinline__ void WriteBack( - int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, - Triple* top_beam_local, const int seq_offset_start, - const int seq_offset_end, const int selected_seq_start, - const int selected_seq_length) { + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, Triple* top_beam_local, + const int seq_offset_start, const int seq_offset_end, + const int selected_seq_start, const int selected_seq_length) { const int tid = threadIdx.x; // use 1 thread only for each sequence int global_index = selected_seq_start; for (int global_offset = seq_offset_start; global_offset < seq_offset_end; @@ -171,6 +171,7 @@ __device__ __forceinline__ void WriteBack( selected_ids[global_index] = static_cast(top_beam_local[local_index].id); selected_scores[global_index] = top_beam_local[local_index].score; + parent_idx[global_index] = static_cast(global_offset); global_index++; } } @@ -180,11 +181,11 @@ __device__ __forceinline__ void WriteBack( template __device__ void BeamSearchDetails( - int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, - const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, - const float* scores, const int seq_offset_start, const int seq_offset_end, - const int seq_width, int beam_size, int end_id, bool is_accumulated, - int num_used_threads) { + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + bool is_accumulated, int num_used_threads) { __shared__ Triple top_beam[MaxLength]; int num_items = 0; @@ -228,15 +229,15 @@ __device__ void BeamSearchDetails( selected_offsets[0] = 0; } - WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, - seq_offset_start, seq_offset_end, selected_seq_start, - selected_seq_length); + WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets, + top_beam_local, seq_offset_start, seq_offset_end, + selected_seq_start, selected_seq_length); } } template __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, - size_t* selected_offsets, + int* parent_idx, size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, const float* scores, const size_t* seq_offsets, @@ -250,24 +251,25 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); BeamSearchDetails( - selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, - scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, - is_accumulated, num_used_threads); + selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids, + pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width, + beam_size, end_id, is_accumulated, num_used_threads); } template __global__ void BeamSearchKernelSingle( - int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, - const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, - const float* scores, const int seq_length, const int seq_width, - int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_length, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { const int seq_offset_start = 0; const int seq_offset_end = seq_length; BeamSearchDetails( - selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, - scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, - is_accumulated, num_used_threads); + selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids, + pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width, + beam_size, end_id, is_accumulated, num_used_threads); } static inline int GetNumUsedThreads(const int max_threads_per_seq, @@ -300,8 +302,9 @@ class BeamSearchFunctor { const framework::LoDTensor* ids, const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores, size_t level, - size_t beam_size, int end_id, bool is_accumulated) { + framework::LoDTensor* selected_scores, + framework::Tensor* parent_idx, size_t level, size_t beam_size, + int end_id, bool is_accumulated) { auto abs_lod = framework::ToAbsOffset(scores->lod()); const int64_t* pre_ids_data = pre_ids->data(); @@ -322,6 +325,8 @@ class BeamSearchFunctor { selected_ids->mutable_data(selected_dims, context.GetPlace()); float* selected_scores_data = selected_scores->mutable_data(selected_dims, context.GetPlace()); + int* parent_idx_data = parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, context.GetPlace()); framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); @@ -339,9 +344,9 @@ class BeamSearchFunctor { CUDA_LAUNCH_KERNEL_HELPER( BeamSearchKernelSingle<<< 1, kMaxThreadsPerSeq, 0, context.stream()>>>( - selected_ids_data, selected_scores_data, selected_offsets, - pre_ids_data, pre_scores_data, ids_data, scores_data, - seq_length, static_cast(seq_width), + selected_ids_data, selected_scores_data, parent_idx_data, + selected_offsets, pre_ids_data, pre_scores_data, ids_data, + scores_data, seq_length, static_cast(seq_width), static_cast(beam_size), static_cast(end_id), is_accumulated, num_used_threads)); } @@ -357,9 +362,9 @@ class BeamSearchFunctor { CUDA_LAUNCH_KERNEL_HELPER( BeamSearchKernel<<< 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( - selected_ids_data, selected_scores_data, selected_offsets, - pre_ids_data, pre_scores_data, ids_data, scores_data, - seq_offsets, static_cast(num_seqs), + selected_ids_data, selected_scores_data, parent_idx_data, + selected_offsets, pre_ids_data, pre_scores_data, ids_data, + scores_data, seq_offsets, static_cast(num_seqs), static_cast(seq_width), static_cast(beam_size), end_id, is_accumulated, num_used_threads)); } @@ -379,6 +384,7 @@ class BeamSearchFunctor { {static_cast(selected_lod[1].back()), 1}); selected_ids->Resize(final_selected_dims); selected_scores->Resize(final_selected_dims); + parent_idx->Resize({static_cast(selected_lod[1].back())}); } } }; diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h index 3cd17f426c5..4474e7ea52a 100644 --- a/paddle/fluid/operators/math/beam_search.h +++ b/paddle/fluid/operators/math/beam_search.h @@ -104,14 +104,12 @@ class BeamSearchFunctor { * Return false if all the input tensor is empty, in machine translation task * that means no candidates is provided, and the task will stop running. */ - void operator()(const DeviceContext& context, - const framework::LoDTensor* pre_ids, - const framework::LoDTensor* pre_scores, - const framework::LoDTensor* ids, - const framework::LoDTensor* scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores, size_t level, - size_t beam_size, int end_id, bool is_accumulated); + void operator()( + const DeviceContext& context, const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids, + const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, framework::Tensor* parent_idx, + size_t level, size_t beam_size, int end_id, bool is_accumulated); }; } // namespace math diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index 1c29ee95f6b..7ea8eb8b00d 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -93,13 +93,14 @@ void TestBeamSearch() { paddle::framework::LoDTensor selected_ids; paddle::framework::LoDTensor selected_scores; + paddle::framework::LoDTensor parent_idx; size_t level = 0; size_t beam_size = 2; int end_id = 0; paddle::operators::math::BeamSearchFunctor beamsearch; beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, - &selected_scores, level, beam_size, end_id, true); + &selected_scores, &parent_idx, level, beam_size, end_id, true); ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0dbcf442a3b..0e4b5aadc0b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3877,7 +3877,8 @@ def beam_search(pre_ids, end_id, level=0, is_accumulated=True, - name=None): + name=None, + return_parent_idx=False): """ Beam search is a classical algorithm for selecting candidate words in a machine translation task. @@ -3933,10 +3934,16 @@ def beam_search(pre_ids, accumulated scores. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + return_parent_idx(bool): Whether to return an extra Tensor variable + preserving the selected_ids' parent indice in pre_ids + in output, which can be used to gather cell states at + the next time step. Returns: - Variable: The LodTensor pair containing the selected ids and the \ - corresponding scores. + Variable: The LodTensor tuple containing the selected ids and the \ + corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \ + an extra Tensor variable preserving the selected_ids' parent indice \ + is included. Examples: .. code-block:: python @@ -3969,6 +3976,11 @@ def beam_search(pre_ids, selected_scores = helper.create_variable_for_type_inference( dtype=score_type) selected_ids = helper.create_variable_for_type_inference(dtype=id_type) + # parent_idx is a tensor used to gather cell states at the next time + # step. Though lod in selected_ids can also be used to gather by + # sequence_expand, it is not efficient. + # gather_op's index input only supports int32 dtype currently + parent_idx = helper.create_variable_for_type_inference(dtype="int32") helper.append_op( type='beam_search', @@ -3976,6 +3988,7 @@ def beam_search(pre_ids, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, + 'parent_idx': parent_idx }, attrs={ # TODO(ChunweiYan) to assure other value support @@ -3984,8 +3997,10 @@ def beam_search(pre_ids, 'end_id': end_id, 'is_accumulated': is_accumulated, }) - - return selected_ids, selected_scores + if return_parent_idx: + return selected_ids, selected_scores, parent_idx + else: + return selected_ids, selected_scores def beam_search_decode(ids, scores, beam_size, end_id, name=None): diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index c28dda4b53c..1d9f4b78f30 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase): self._create_pre_ids() self.scope.var('selected_ids') self.scope.var('selected_scores') + self.scope.var('parent_idx') def test_run(self): op = Operator( @@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase): scores='scores', selected_ids='selected_ids', selected_scores='selected_scores', + parent_idx='parent_idx', level=0, beam_size=2, end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() selected_scores = self.scope.find_var("selected_scores").get_tensor() + parent_idx = self.scope.find_var("parent_idx").get_tensor() self.assertTrue( np.allclose( np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) @@ -62,6 +65,8 @@ class BeamSearchOpTester(unittest.TestCase): np.array(selected_scores), np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]]) + self.assertTrue( + np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3]))) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') -- GitLab