diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 349460ad98eea86e89f58a0e8272dde2b5955026..fe8d6dd42595dfc9dc4bf17e336df801b582703e 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
 paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index e78ecc1a12309fe084a4165e5bb0d8bfb1dcf957..e93cd8615e052e4dfc6255549bf7a9b84b7dd657 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -51,6 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("selected_scores",
               "A LoDTensor containing the accumulated scores corresponding to "
               "Output(selected_ids).");
+    AddOutput(
+        "parent_idx",
+        "A Tensor preserving the selected_ids' parent indice in pre_ids.");
 
     // Attributes stored in AttributeMap
     AddAttr<int>("level", "the level of LoDTensor");
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index 1b939e742de06aedf187d25d002d19e0a4fafc9d..f808020cc765585d1633c6c3bf528080a7e83f07 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -41,13 +41,15 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
+    auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
+    PADDLE_ENFORCE_NOT_NULL(parent_idx);
 
     math::BeamSearchFunctor<DeviceContext, T> alg;
     alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
-        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
-        is_accumulated);
+        ids, scores, selected_ids, selected_scores, parent_idx, level,
+        beam_size, end_id, is_accumulated);
   }
 };
 
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 9f4aef08cd58e72ce344a640e6564b9e360ce169..427ac61858ea6de9dc59ac52d1b7e77a90d5139c 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     auto *output = ctx.Output<Tensor>("Out");
 
     output->mutable_data<T>(ctx.GetPlace());
-
+    if (x->numel() == 0) return;
     GPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
@@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *x = ctx.Input<Tensor>("X");
 
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
                        .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
-
+    if (dO->numel() == 0) return;
     GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index 2dd726bebb1bc2e4d83844c0b98df01c390e622f..2e18298cf8e34d5f70369c89b3b3b2a9ced0ce62 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
     auto *output = ctx.Output<Tensor>("Out");
 
     output->mutable_data<T>(ctx.GetPlace());
-
+    if (x->numel() == 0) return;
     CPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
@@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
-
+    if (dO->numel() == 0) return;
     ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
index fb7119273a734feba870fdabade6a4faa1d5e9a3..69971ef7423eff6bc3f8543a491edb6b0bbd00ca 100644
--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -29,8 +29,9 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
                   const framework::LoDTensor *ids,
                   const framework::LoDTensor *scores,
                   framework::LoDTensor *selected_ids,
-                  framework::LoDTensor *selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated) {
+                  framework::LoDTensor *selected_scores,
+                  framework::Tensor *parent_idx, size_t level, size_t beam_size,
+                  int end_id, bool is_accumulated) {
     auto abs_lod = framework::ToAbsOffset(scores->lod());
     auto &high_level = abs_lod[level];
 
@@ -57,11 +58,13 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
         std::vector<int64_t>({static_cast<int>(num_instances), 1}));
     selected_ids->Resize(dims);
     selected_scores->Resize(dims);
+    parent_idx->Resize({static_cast<int64_t>(num_instances)});
 
     auto *selected_ids_data =
         selected_ids->mutable_data<int64_t>(platform::CPUPlace());
     auto *selected_scores_data =
         selected_scores->mutable_data<float>(platform::CPUPlace());
+    auto *parent_idx_data = parent_idx->mutable_data<int>(platform::CPUPlace());
 
     // fill in data
     std::vector<size_t> low_level;
@@ -69,6 +72,7 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
     for (auto &items : selected_items) {
       low_level.push_back(low_offset);
       for (auto &item : items) {
+        parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
         selected_ids_data[low_offset] = item.id;
         selected_scores_data[low_offset] = item.score;
         low_offset++;
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index d94e3023ce537cb9fa456e079c4fa3cf57fb954d..61d021ef627f1ccd90b992c2078a7f3ca879422d 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -157,10 +157,10 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
 }
 
 __device__ __forceinline__ void WriteBack(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    Triple* top_beam_local, const int seq_offset_start,
-    const int seq_offset_end, const int selected_seq_start,
-    const int selected_seq_length) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, Triple* top_beam_local,
+    const int seq_offset_start, const int seq_offset_end,
+    const int selected_seq_start, const int selected_seq_length) {
   const int tid = threadIdx.x;  // use 1 thread only for each sequence
   int global_index = selected_seq_start;
   for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
@@ -171,6 +171,7 @@ __device__ __forceinline__ void WriteBack(
         selected_ids[global_index] =
             static_cast<int64_t>(top_beam_local[local_index].id);
         selected_scores[global_index] = top_beam_local[local_index].score;
+        parent_idx[global_index] = static_cast<int>(global_offset);
         global_index++;
       }
     }
@@ -180,11 +181,11 @@ __device__ __forceinline__ void WriteBack(
 
 template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
 __device__ void BeamSearchDetails(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
-    const float* scores, const int seq_offset_start, const int seq_offset_end,
-    const int seq_width, int beam_size, int end_id, bool is_accumulated,
-    int num_used_threads) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_offset_start,
+    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
+    bool is_accumulated, int num_used_threads) {
   __shared__ Triple top_beam[MaxLength];
 
   int num_items = 0;
@@ -228,15 +229,15 @@ __device__ void BeamSearchDetails(
       selected_offsets[0] = 0;
     }
 
-    WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
-              seq_offset_start, seq_offset_end, selected_seq_start,
-              selected_seq_length);
+    WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets,
+              top_beam_local, seq_offset_start, seq_offset_end,
+              selected_seq_start, selected_seq_length);
   }
 }
 
 template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
 __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
-                                 size_t* selected_offsets,
+                                 int* parent_idx, size_t* selected_offsets,
                                  const int64_t* pre_ids,
                                  const float* pre_scores, const int64_t* ids,
                                  const float* scores, const size_t* seq_offsets,
@@ -250,24 +251,25 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
   int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
 
   BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
-      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
-      scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
-      is_accumulated, num_used_threads);
+      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
+      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
+      beam_size, end_id, is_accumulated, num_used_threads);
 }
 
 template <int MaxLength, int MaxThreadsPerSeq>
 __global__ void BeamSearchKernelSingle(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
-    const float* scores, const int seq_length, const int seq_width,
-    int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_length,
+    const int seq_width, int beam_size, int end_id, bool is_accumulated,
+    int num_used_threads) {
   const int seq_offset_start = 0;
   const int seq_offset_end = seq_length;
 
   BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
-      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
-      scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
-      is_accumulated, num_used_threads);
+      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
+      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
+      beam_size, end_id, is_accumulated, num_used_threads);
 }
 
 static inline int GetNumUsedThreads(const int max_threads_per_seq,
@@ -300,8 +302,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
                   const framework::LoDTensor* ids,
                   const framework::LoDTensor* scores,
                   framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated) {
+                  framework::LoDTensor* selected_scores,
+                  framework::Tensor* parent_idx, size_t level, size_t beam_size,
+                  int end_id, bool is_accumulated) {
     auto abs_lod = framework::ToAbsOffset(scores->lod());
 
     const int64_t* pre_ids_data = pre_ids->data<int64_t>();
@@ -322,6 +325,8 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
         selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
     float* selected_scores_data =
         selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
+    int* parent_idx_data = parent_idx->mutable_data<int>(
+        {static_cast<int64_t>(num_seqs * beam_size)}, context.GetPlace());
 
     framework::LoD selected_lod(2);
     selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
@@ -339,9 +344,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
         CUDA_LAUNCH_KERNEL_HELPER(
             BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
                 1, kMaxThreadsPerSeq, 0, context.stream()>>>(
-                selected_ids_data, selected_scores_data, selected_offsets,
-                pre_ids_data, pre_scores_data, ids_data, scores_data,
-                seq_length, static_cast<int>(seq_width),
+                selected_ids_data, selected_scores_data, parent_idx_data,
+                selected_offsets, pre_ids_data, pre_scores_data, ids_data,
+                scores_data, seq_length, static_cast<int>(seq_width),
                 static_cast<int>(beam_size), static_cast<int>(end_id),
                 is_accumulated, num_used_threads));
       }
@@ -357,9 +362,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
         CUDA_LAUNCH_KERNEL_HELPER(
             BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
                 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
-                selected_ids_data, selected_scores_data, selected_offsets,
-                pre_ids_data, pre_scores_data, ids_data, scores_data,
-                seq_offsets, static_cast<int>(num_seqs),
+                selected_ids_data, selected_scores_data, parent_idx_data,
+                selected_offsets, pre_ids_data, pre_scores_data, ids_data,
+                scores_data, seq_offsets, static_cast<int>(num_seqs),
                 static_cast<int>(seq_width), static_cast<int>(beam_size),
                 end_id, is_accumulated, num_used_threads));
       }
@@ -379,6 +384,7 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
           {static_cast<int64_t>(selected_lod[1].back()), 1});
       selected_ids->Resize(final_selected_dims);
       selected_scores->Resize(final_selected_dims);
+      parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())});
     }
   }
 };
diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h
index 3cd17f426c5596582c91f2b3f0cc5ba513e3aa4b..4474e7ea52affed792572d02202ec2577c471e50 100644
--- a/paddle/fluid/operators/math/beam_search.h
+++ b/paddle/fluid/operators/math/beam_search.h
@@ -104,14 +104,12 @@ class BeamSearchFunctor {
    * Return false if all the input tensor is empty, in machine translation task
    * that means no candidates is provided, and the task will stop running.
    */
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor* pre_ids,
-                  const framework::LoDTensor* pre_scores,
-                  const framework::LoDTensor* ids,
-                  const framework::LoDTensor* scores,
-                  framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated);
+  void operator()(
+      const DeviceContext& context, const framework::LoDTensor* pre_ids,
+      const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids,
+      const framework::LoDTensor* scores, framework::LoDTensor* selected_ids,
+      framework::LoDTensor* selected_scores, framework::Tensor* parent_idx,
+      size_t level, size_t beam_size, int end_id, bool is_accumulated);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc
index 1c29ee95f6b109209316e4e8c8f3cda37eac62ae..7ea8eb8b00db328ca13d3d33d751aca4eac66dae 100644
--- a/paddle/fluid/operators/math/beam_search_test.cc
+++ b/paddle/fluid/operators/math/beam_search_test.cc
@@ -93,13 +93,14 @@ void TestBeamSearch() {
 
   paddle::framework::LoDTensor selected_ids;
   paddle::framework::LoDTensor selected_scores;
+  paddle::framework::LoDTensor parent_idx;
 
   size_t level = 0;
   size_t beam_size = 2;
   int end_id = 0;
   paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
   beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
-             &selected_scores, level, beam_size, end_id, true);
+             &selected_scores, &parent_idx, level, beam_size, end_id, true);
 
   ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0dbcf442a3b098f60ec803152f23f4e65970d948..0e4b5aadc0b0d7e87ea1cfb8e18339fe211e1eef 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3877,7 +3877,8 @@ def beam_search(pre_ids,
                 end_id,
                 level=0,
                 is_accumulated=True,
-                name=None):
+                name=None,
+                return_parent_idx=False):
     """
     Beam search is a classical algorithm for selecting candidate words in a
     machine translation task.
@@ -3933,10 +3934,16 @@ def beam_search(pre_ids,
              accumulated scores.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
+        return_parent_idx(bool): Whether to return an extra Tensor variable 
+                        preserving the selected_ids' parent indice in pre_ids
+                        in output, which can be used to gather cell states at
+                        the next time step.
 
     Returns:
-        Variable: The LodTensor pair containing the selected ids and the \
-            corresponding scores.
+        Variable: The LodTensor tuple containing the selected ids and the \
+            corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \
+            an extra Tensor variable preserving the selected_ids' parent indice \
+            is included.
 
     Examples:
         .. code-block:: python
@@ -3969,6 +3976,11 @@ def beam_search(pre_ids,
     selected_scores = helper.create_variable_for_type_inference(
         dtype=score_type)
     selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
+    # parent_idx is a tensor used to gather cell states at the next time
+    # step. Though lod in selected_ids can also be used to gather by
+    # sequence_expand, it is not efficient.
+    # gather_op's index input only supports int32 dtype currently
+    parent_idx = helper.create_variable_for_type_inference(dtype="int32")
 
     helper.append_op(
         type='beam_search',
@@ -3976,6 +3988,7 @@ def beam_search(pre_ids,
         outputs={
             'selected_ids': selected_ids,
             'selected_scores': selected_scores,
+            'parent_idx': parent_idx
         },
         attrs={
             # TODO(ChunweiYan) to assure other value support
@@ -3984,8 +3997,10 @@ def beam_search(pre_ids,
             'end_id': end_id,
             'is_accumulated': is_accumulated,
         })
-
-    return selected_ids, selected_scores
+    if return_parent_idx:
+        return selected_ids, selected_scores, parent_idx
+    else:
+        return selected_ids, selected_scores
 
 
 def beam_search_decode(ids, scores, beam_size, end_id, name=None):
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index c28dda4b53ce5d394ff11222e5df8d257b4e80da..1d9f4b78f30fefa21c189036c3731e0afe39ea9e 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase):
         self._create_pre_ids()
         self.scope.var('selected_ids')
         self.scope.var('selected_scores')
+        self.scope.var('parent_idx')
 
     def test_run(self):
         op = Operator(
@@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase):
             scores='scores',
             selected_ids='selected_ids',
             selected_scores='selected_scores',
+            parent_idx='parent_idx',
             level=0,
             beam_size=2,
             end_id=0, )
         op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         selected_scores = self.scope.find_var("selected_scores").get_tensor()
+        parent_idx = self.scope.find_var("parent_idx").get_tensor()
         self.assertTrue(
             np.allclose(
                 np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
@@ -62,6 +65,8 @@ class BeamSearchOpTester(unittest.TestCase):
                 np.array(selected_scores),
                 np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
         self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
+        self.assertTrue(
+            np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3])))
 
     def _create_pre_ids(self):
         np_data = np.array([[1, 2, 3, 4]], dtype='int64')