未验证 提交 16d54f7f 编写于 作者: Y Yiqun Liu 提交者: GitHub

Return parent_idx in beam_search op (#15520)

* Refine beam_search_op to output an extra parent_idx tensor.
test=develop

* Fix the unittest test_beam_search_op.
test=develop

* Fix the merging mistake.
test=develop
上级 72ee3c62
......@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
......
......@@ -51,6 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("selected_scores",
"A LoDTensor containing the accumulated scores corresponding to "
"Output(selected_ids).");
AddOutput(
"parent_idx",
"A Tensor preserving the selected_ids' parent indice in pre_ids.");
// Attributes stored in AttributeMap
AddAttr<int>("level", "the level of LoDTensor");
......
......@@ -41,13 +41,15 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
auto selected_scores =
context.Output<framework::LoDTensor>("selected_scores");
auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
PADDLE_ENFORCE_NOT_NULL(selected_ids);
PADDLE_ENFORCE_NOT_NULL(selected_scores);
PADDLE_ENFORCE_NOT_NULL(parent_idx);
math::BeamSearchFunctor<DeviceContext, T> alg;
alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
is_accumulated);
ids, scores, selected_ids, selected_scores, parent_idx, level,
beam_size, end_id, is_accumulated);
}
};
......
......@@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
auto *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
if (x->numel() == 0) return;
GPUGather<T>(ctx.device_context(), *x, *index, output);
}
};
......@@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
auto *Index = ctx.Input<Tensor>("Index");
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *x = ctx.Input<Tensor>("X");
dX->mutable_data<T>(ctx.GetPlace());
auto dxt = framework::EigenVector<T>::Flatten(*dX);
auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
dxt.device(place) = dxt.constant(static_cast<T>(0));
if (dO->numel() == 0) return;
GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
}
};
......
......@@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
auto *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
if (x->numel() == 0) return;
CPUGather<T>(ctx.device_context(), *x, *index, output);
}
};
......@@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
dxt.device(place) = dxt.constant(static_cast<T>(0));
if (dO->numel() == 0) return;
ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
}
};
......
......@@ -29,8 +29,9 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
const framework::LoDTensor *ids,
const framework::LoDTensor *scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated) {
framework::LoDTensor *selected_scores,
framework::Tensor *parent_idx, size_t level, size_t beam_size,
int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto &high_level = abs_lod[level];
......@@ -57,11 +58,13 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
auto *selected_ids_data =
selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *selected_scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
auto *parent_idx_data = parent_idx->mutable_data<int>(platform::CPUPlace());
// fill in data
std::vector<size_t> low_level;
......@@ -69,6 +72,7 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
selected_ids_data[low_offset] = item.id;
selected_scores_data[low_offset] = item.score;
low_offset++;
......
......@@ -157,10 +157,10 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
}
__device__ __forceinline__ void WriteBack(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
Triple* top_beam_local, const int seq_offset_start,
const int seq_offset_end, const int selected_seq_start,
const int selected_seq_length) {
int64_t* selected_ids, float* selected_scores, int* parent_idx,
size_t* selected_offsets, Triple* top_beam_local,
const int seq_offset_start, const int seq_offset_end,
const int selected_seq_start, const int selected_seq_length) {
const int tid = threadIdx.x; // use 1 thread only for each sequence
int global_index = selected_seq_start;
for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
......@@ -171,6 +171,7 @@ __device__ __forceinline__ void WriteBack(
selected_ids[global_index] =
static_cast<int64_t>(top_beam_local[local_index].id);
selected_scores[global_index] = top_beam_local[local_index].score;
parent_idx[global_index] = static_cast<int>(global_offset);
global_index++;
}
}
......@@ -180,11 +181,11 @@ __device__ __forceinline__ void WriteBack(
template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__device__ void BeamSearchDetails(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
const float* scores, const int seq_offset_start, const int seq_offset_end,
const int seq_width, int beam_size, int end_id, bool is_accumulated,
int num_used_threads) {
int64_t* selected_ids, float* selected_scores, int* parent_idx,
size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
const int64_t* ids, const float* scores, const int seq_offset_start,
const int seq_offset_end, const int seq_width, int beam_size, int end_id,
bool is_accumulated, int num_used_threads) {
__shared__ Triple top_beam[MaxLength];
int num_items = 0;
......@@ -228,15 +229,15 @@ __device__ void BeamSearchDetails(
selected_offsets[0] = 0;
}
WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
seq_offset_start, seq_offset_end, selected_seq_start,
selected_seq_length);
WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets,
top_beam_local, seq_offset_start, seq_offset_end,
selected_seq_start, selected_seq_length);
}
}
template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
size_t* selected_offsets,
int* parent_idx, size_t* selected_offsets,
const int64_t* pre_ids,
const float* pre_scores, const int64_t* ids,
const float* scores, const size_t* seq_offsets,
......@@ -250,24 +251,25 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
is_accumulated, num_used_threads);
selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
beam_size, end_id, is_accumulated, num_used_threads);
}
template <int MaxLength, int MaxThreadsPerSeq>
__global__ void BeamSearchKernelSingle(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
const float* scores, const int seq_length, const int seq_width,
int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
int64_t* selected_ids, float* selected_scores, int* parent_idx,
size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
const int64_t* ids, const float* scores, const int seq_length,
const int seq_width, int beam_size, int end_id, bool is_accumulated,
int num_used_threads) {
const int seq_offset_start = 0;
const int seq_offset_end = seq_length;
BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
is_accumulated, num_used_threads);
selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
beam_size, end_id, is_accumulated, num_used_threads);
}
static inline int GetNumUsedThreads(const int max_threads_per_seq,
......@@ -300,8 +302,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
const framework::LoDTensor* ids,
const framework::LoDTensor* scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated) {
framework::LoDTensor* selected_scores,
framework::Tensor* parent_idx, size_t level, size_t beam_size,
int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
const int64_t* pre_ids_data = pre_ids->data<int64_t>();
......@@ -322,6 +325,8 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
float* selected_scores_data =
selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
int* parent_idx_data = parent_idx->mutable_data<int>(
{static_cast<int64_t>(num_seqs * beam_size)}, context.GetPlace());
framework::LoD selected_lod(2);
selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
......@@ -339,9 +344,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
CUDA_LAUNCH_KERNEL_HELPER(
BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
1, kMaxThreadsPerSeq, 0, context.stream()>>>(
selected_ids_data, selected_scores_data, selected_offsets,
pre_ids_data, pre_scores_data, ids_data, scores_data,
seq_length, static_cast<int>(seq_width),
selected_ids_data, selected_scores_data, parent_idx_data,
selected_offsets, pre_ids_data, pre_scores_data, ids_data,
scores_data, seq_length, static_cast<int>(seq_width),
static_cast<int>(beam_size), static_cast<int>(end_id),
is_accumulated, num_used_threads));
}
......@@ -357,9 +362,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
CUDA_LAUNCH_KERNEL_HELPER(
BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
selected_ids_data, selected_scores_data, selected_offsets,
pre_ids_data, pre_scores_data, ids_data, scores_data,
seq_offsets, static_cast<int>(num_seqs),
selected_ids_data, selected_scores_data, parent_idx_data,
selected_offsets, pre_ids_data, pre_scores_data, ids_data,
scores_data, seq_offsets, static_cast<int>(num_seqs),
static_cast<int>(seq_width), static_cast<int>(beam_size),
end_id, is_accumulated, num_used_threads));
}
......@@ -379,6 +384,7 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
{static_cast<int64_t>(selected_lod[1].back()), 1});
selected_ids->Resize(final_selected_dims);
selected_scores->Resize(final_selected_dims);
parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())});
}
}
};
......
......@@ -104,14 +104,12 @@ class BeamSearchFunctor {
* Return false if all the input tensor is empty, in machine translation task
* that means no candidates is provided, and the task will stop running.
*/
void operator()(const DeviceContext& context,
const framework::LoDTensor* pre_ids,
const framework::LoDTensor* pre_scores,
const framework::LoDTensor* ids,
const framework::LoDTensor* scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated);
void operator()(
const DeviceContext& context, const framework::LoDTensor* pre_ids,
const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids,
const framework::LoDTensor* scores, framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, framework::Tensor* parent_idx,
size_t level, size_t beam_size, int end_id, bool is_accumulated);
};
} // namespace math
......
......@@ -93,13 +93,14 @@ void TestBeamSearch() {
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, level, beam_size, end_id, true);
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
......
......@@ -3877,7 +3877,8 @@ def beam_search(pre_ids,
end_id,
level=0,
is_accumulated=True,
name=None):
name=None,
return_parent_idx=False):
"""
Beam search is a classical algorithm for selecting candidate words in a
machine translation task.
......@@ -3933,10 +3934,16 @@ def beam_search(pre_ids,
accumulated scores.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
return_parent_idx(bool): Whether to return an extra Tensor variable
preserving the selected_ids' parent indice in pre_ids
in output, which can be used to gather cell states at
the next time step.
Returns:
Variable: The LodTensor pair containing the selected ids and the \
corresponding scores.
Variable: The LodTensor tuple containing the selected ids and the \
corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \
an extra Tensor variable preserving the selected_ids' parent indice \
is included.
Examples:
.. code-block:: python
......@@ -3969,6 +3976,11 @@ def beam_search(pre_ids,
selected_scores = helper.create_variable_for_type_inference(
dtype=score_type)
selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
# parent_idx is a tensor used to gather cell states at the next time
# step. Though lod in selected_ids can also be used to gather by
# sequence_expand, it is not efficient.
# gather_op's index input only supports int32 dtype currently
parent_idx = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(
type='beam_search',
......@@ -3976,6 +3988,7 @@ def beam_search(pre_ids,
outputs={
'selected_ids': selected_ids,
'selected_scores': selected_scores,
'parent_idx': parent_idx
},
attrs={
# TODO(ChunweiYan) to assure other value support
......@@ -3984,8 +3997,10 @@ def beam_search(pre_ids,
'end_id': end_id,
'is_accumulated': is_accumulated,
})
return selected_ids, selected_scores
if return_parent_idx:
return selected_ids, selected_scores, parent_idx
else:
return selected_ids, selected_scores
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
......
......@@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase):
self._create_pre_ids()
self.scope.var('selected_ids')
self.scope.var('selected_scores')
self.scope.var('parent_idx')
def test_run(self):
op = Operator(
......@@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase):
scores='scores',
selected_ids='selected_ids',
selected_scores='selected_scores',
parent_idx='parent_idx',
level=0,
beam_size=2,
end_id=0, )
op.run(self.scope, core.CPUPlace())
selected_ids = self.scope.find_var("selected_ids").get_tensor()
selected_scores = self.scope.find_var("selected_scores").get_tensor()
parent_idx = self.scope.find_var("parent_idx").get_tensor()
self.assertTrue(
np.allclose(
np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
......@@ -62,6 +65,8 @@ class BeamSearchOpTester(unittest.TestCase):
np.array(selected_scores),
np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
self.assertTrue(
np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3])))
def _create_pre_ids(self):
np_data = np.array([[1, 2, 3, 4]], dtype='int64')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册