diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
index 9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee..c4a9bbeeefca0e05c335dd60233691e8bac33015 100644
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
 The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
 let's call this format the **absolute-offset LoD** for clarity.
 
-The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
 ```python
 [[0, 3, 9]
  [0, 2, 3, 3, 3, 9]]
@@ -119,7 +119,7 @@ def generate():
         encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
         decoder_input = pd.fc(
             act=pd.activation.Linear(),
-            input=[target_word, encoder_ctx],
+            input=[target_word, encoder_ctx_expanded],
             size=3 * decoder_dim)
         gru_out, cur_mem = pd.gru_step(
             decoder_input, mem=decoder_mem, size=decoder_dim)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 1382bfca19a674a404916a5c709276ce41219d2f..bd58c0a7f8161f6b45f2b500f3685e4028d97e96 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -116,8 +116,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(3) << op->DebugStringEx(local_scope);
+    VLOG(4) << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
+    VLOG(3) << op->DebugStringEx(local_scope);
     if (FLAGS_do_memory_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index b29f528f3f749efa3463125c774c2f4d4ebcbc7c..53b0d0fe083579da4f0bb600f292765aa2aa0d8a 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
   if (in.empty() || in.size() == 1) return in;
   LoD result = in;
-  for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto &ele : result[level]) {
-      ele = result[level + 1][ele];
+  for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
+    for (size_t i = 0; i < in[level].size(); ++i) {
+      size_t index = in[level][i];
+      result[level][i] = result[level + 1][index];
     }
   }
   return result;
diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc
index 4c71d66d22899d2cf6418935bf9358a0f73cec27..844ade40eb2a7ae239b079daa609f03b9e7a06df 100644
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -24,8 +24,18 @@ namespace operators {
 void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
                             framework::LoDTensor *selected_ids,
                             framework::LoDTensor *selected_scores) {
+  auto abs_lod = framework::ToAbsOffset(ids_->lod());
+  auto &high_level = abs_lod[lod_level_];
+
   auto items = SelectTopBeamSizeItems();
-  auto selected_items = ToMap(items);
+  auto selected_items = ToMap(items, high_level.back());
+  VLOG(3) << "selected_items:";
+  for (size_t i = 0; i < selected_items.size(); ++i) {
+    VLOG(3) << "offset:" << i;
+    for (auto &item : selected_items[i]) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
   PruneEndidCandidates(pre_ids, &selected_items);
   // calculate the output tensor's height
   size_t num_instances = std::accumulate(
@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
   low_level.push_back(low_offset);
 
   // fill lod
-  auto abs_lod = framework::ToAbsOffset(ids_->lod());
-  auto &high_level = abs_lod[lod_level_];
   framework::LoD lod(2);
   lod[0].assign(high_level.begin(), high_level.end());
   lod[1].assign(low_level.begin(), low_level.end());
+  if (!framework::CheckLoD(lod)) {
+    PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+  }
   selected_ids->set_lod(lod);
   selected_scores->set_lod(lod);
 }
@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
 }
 
 std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
-    const std::vector<std::vector<Item>> &items) {
+    const std::vector<std::vector<Item>> &items, size_t element_num) {
   std::vector<std::vector<Item>> result;
+  result.resize(element_num);
   for (auto &entries : items) {
     for (const auto &item : entries) {
-      if (item.offset >= result.size()) {
-        result.resize(item.offset + 1);
-      }
       result[item.offset].push_back(item);
     }
   }
@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() {
     }
     result.emplace_back(items);
   }
+  VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+  for (auto &items : result) {
+    VLOG(3) << "item set:";
+    for (auto &item : items) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
+
   return result;
 }
 
@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
   return true;
 }
 
+std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
+  os << "{";
+  os << "offset: " << item.offset << ", ";
+  os << "id: " << item.id << ", ";
+  os << "score: " << item.score << "";
+  os << "}";
+
+  return os;
+}
+
+std::string ItemToString(const BeamSearch::Item &item) {
+  std::ostringstream stream;
+  stream << item;
+  return stream.str();
+}
+
 class BeamSearchProtoAndCheckerMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker
   }
 };
 
+class BeamSearchInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    for (const std::string &arg :
+         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+      PADDLE_ENFORCE(context->HasInput(arg),
+                     "BeamSearch need input argument '%s'", arg);
+    }
+    for (const std::string &arg :
+         std::vector<std::string>({"selected_ids", "selected_scores"})) {
+      PADDLE_ENFORCE(context->HasOutput(arg),
+                     "BeamSearch need output argument '%s'", arg);
+    }
+  }
+};
+
+class BeamSearchInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &o : op_desc.Output("selected_ids")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+    for (auto &o : op_desc.Output("selected_scores")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp,
-                             paddle::operators::BeamSearchProtoAndCheckerMaker);
+REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
+                  paddle::operators::BeamSearchProtoAndCheckerMaker,
+                  paddle::operators::BeamSearchInferShape,
+                  paddle::operators::BeamSearchInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h
index 45d14d68fe8d1c4a84aa826e68e76692444765a8..7ad85874fcbd6ea48d688b32f2cc982d6b76d3c4 100644
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -136,8 +136,6 @@ class BeamSearch {
   void operator()(const framework::LoDTensor& pre_ids,
                   framework::LoDTensor* selected_ids,
                   framework::LoDTensor* selected_scores);
-
- protected:
   /*
    * The basic items help to sort.
    */
@@ -155,6 +153,7 @@ class BeamSearch {
     score_t score;
   };
 
+ protected:
   /*
    * Delete all the records that follows the end token.
    */
@@ -166,7 +165,7 @@ class BeamSearch {
    * NOTE low performance
    */
   std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>>& inputs);
+      const std::vector<std::vector<Item>>& inputs, size_t element_num);
 
   /*
    * For each source, select top beam_size records.
@@ -187,6 +186,10 @@ class BeamSearch {
   int end_id_{0};
 };
 
+std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
+
+std::string ItemToString(const BeamSearch::Item& item);
+
 class BeamSearchOp : public framework::OperatorBase {
  public:
   BeamSearchOp(const std::string& type,
@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase {
 
   void Run(const framework::Scope& scope,
            const platform::Place& dev_place) const override {
-    LOG(INFO) << "run beam search op";
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
     auto pre_ids_var = scope.FindVar(Input("pre_ids"));
@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase {
     size_t level = Attr<int>("level");
     size_t beam_size = Attr<int>("beam_size");
     int end_id = Attr<int>("end_id");
-    LOG(INFO) << "init beam search";
     BeamSearch alg(ids, scores, level, beam_size, end_id);
 
-    LOG(INFO) << "after beam search";
     auto selected_ids_var = scope.FindVar(Output("selected_ids"));
     auto selected_scores_var = scope.FindVar(Output("selected_scores"));
     PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase {
         *selected_ids_var->GetMutable<framework::LoDTensor>();
     auto& selected_scores_tensor =
         *selected_scores_var->GetMutable<framework::LoDTensor>();
-    LOG(INFO) << "run beam search";
     alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
-    LOG(INFO) << "finish beam search";
   }
 };
 
diff --git a/paddle/operators/sequence_expand_op.h b/paddle/operators/sequence_expand_op.h
index 2ba628e9c37278025e31779ab0468db46f2ff40a..6021526eee8e0a1f58885f6de38b14048787a828 100644
--- a/paddle/operators/sequence_expand_op.h
+++ b/paddle/operators/sequence_expand_op.h
@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     auto x_dims = x->dims();
     auto* y = context.Input<LoDTensor>("Y");
+    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
     PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
                       y->lod().back().size() - 1,
                       "The size of last lod level in Input(Y)"
diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h
index e9cd9bbd4d964c28f305fb4ab4c4733ed27ebfff..bf42e15e6b234125d9ec24e8500367b9915213ab 100644
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -22,6 +22,7 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
     // FIXME: only deal with matrix(2d tensor).
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
+    auto* input = ctx.Input<LoDTensor>("X");
+    auto* output = ctx.Output<LoDTensor>("Out");
+    auto* indices = ctx.Output<LoDTensor>("Indices");
     // k is determined by Attr
     const size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 0b0064ade90d2b70dd1458cb4d20d741fbf1efcd..8c481444e9bf895d29ed4e4952e825c2eaafc915 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -100,7 +100,8 @@ class LayerHelper(object):
             if dtype is None:
                 dtype = each.dtype
             elif dtype != each.dtype:
-                raise ValueError("Data Type mismatch")
+                raise ValueError("Data Type mismatch: %d to %d" %
+                                 (dtype, each.dtype))
         return dtype
 
     def create_parameter(self,
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 5f01fdb076d3bf7d060a805d1431f4973993a843..ed627dac26d286214f27ea43e887c52c394e841d 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -769,7 +769,7 @@ def topk(input, k):
           array = fluid.layers.topk(x, k)
     """
     helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
     topk_indices = helper.create_tmp_variable(dtype='int64')
     helper.append_op(
         type='top_k',
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 7b9fb024fb2f7b5c7892354bf69138cc83ed1f2c..efad649078de911295a06f52338bb4357cf228cc 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -61,6 +61,7 @@ __all__ = [
     'transpose',
     'im2sequence',
     'nce',
+    'beam_search',
 ]
 
 
@@ -163,10 +164,8 @@ def fc(input,
         tmp = helper.create_tmp_variable(dtype)
         helper.append_op(
             type="mul",
-            inputs={
-                "X": input_var,
-                "Y": w,
-            },
+            inputs={"X": input_var,
+                    "Y": w},
             outputs={"Out": tmp},
             attrs={"x_num_col_dims": num_flatten_dims,
                    "y_num_col_dims": 1})
@@ -1551,6 +1550,38 @@ def sequence_expand(x, y, name=None):
     return tmp
 
 
+def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
+    '''
+    This function implements the beam search algorithm.
+    '''
+    helper = LayerHelper('beam_search', **locals())
+    score_type = scores.dtype
+    id_type = ids.dtype
+
+    selected_scores = helper.create_tmp_variable(dtype=score_type)
+    selected_ids = helper.create_tmp_variable(dtype=id_type)
+
+    helper.append_op(
+        type='beam_search',
+        inputs={
+            'pre_ids': pre_ids,
+            'ids': ids,
+            'scores': scores,
+        },
+        outputs={
+            'selected_ids': selected_ids,
+            'selected_scores': selected_scores,
+        },
+        attrs={
+            # TODO(ChunweiYan) to assure other value support
+            'level': level,
+            'beam_size': beam_size,
+            'end_id': end_id,
+        })
+
+    return selected_ids, selected_scores
+
+
 def lstm_unit(x_t,
               hidden_t_prev,
               cell_t_prev,
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
index 53ae200a2387712c63ab67f44d4e9da03ebbe4b2..82b760d693560dae1ab1fa39afdc186f60423e65 100644
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -17,7 +17,7 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.layers as pd
 from paddle.v2.fluid.executor import Executor
 
 dict_size = 30000
@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
 hidden_dim = 32
 word_dim = 16
 IS_SPARSE = True
-batch_size = 10
-max_length = 50
+batch_size = 2
+max_length = 8
 topk_size = 50
 trg_dic_size = 10000
+beam_size = 2
 
 decoder_size = hidden_dim
 
+place = core.CPUPlace()
 
-def encoder_decoder():
+
+def encoder():
     # encoder
-    src_word_id = layers.data(
+    src_word_id = pd.data(
         name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = layers.embedding(
+    src_embedding = pd.embedding(
         input=src_word_id,
         size=[dict_size, word_dim],
         dtype='float32',
         is_sparse=IS_SPARSE,
         param_attr=fluid.ParamAttr(name='vemb'))
 
-    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
 
+def decoder_train(context):
     # decoder
-    trg_language_word = layers.data(
+    trg_language_word = pd.data(
         name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = layers.embedding(
+    trg_embedding = pd.embedding(
         input=trg_language_word,
         size=[dict_size, word_dim],
         dtype='float32',
         is_sparse=IS_SPARSE,
         param_attr=fluid.ParamAttr(name='vemb'))
 
-    rnn = fluid.layers.DynamicRNN()
+    rnn = pd.DynamicRNN()
     with rnn.block():
         current_word = rnn.step_input(trg_embedding)
-        mem = rnn.memory(init=encoder_out)
-        fc1 = fluid.layers.fc(input=[current_word, mem],
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
                               size=decoder_size,
                               act='tanh')
-        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
-        rnn.update_memory(mem, fc1)
-        rnn.output(out)
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)
 
     return rnn()
 
 
+def decoder_decode(context):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64')
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE)
+
+        # use rnn unit to update rnn
+        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
+                              size=decoder_size,
+                              act='tanh')
+
+        # use score to do beam search
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=50)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
+        pd.less_than(x=counter, y=array_len, cond=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def set_init_lod(data, lod, place):
+    res = core.LoDTensor()
+    res.set(data, place)
+    res.set_lod(lod)
+    return res
+
+
 def to_lodtensor(data, place):
     seq_lens = [len(seq) for seq in data]
     cur_len = 0
@@ -88,12 +171,13 @@ def to_lodtensor(data, place):
     return res
 
 
-def main():
-    rnn_out = encoder_decoder()
-    label = layers.data(
+def train_main():
+    context = encoder()
+    rnn_out = decoder_train(context)
+    label = pd.data(
         name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = layers.cross_entropy(input=rnn_out, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(x=cost)
 
     optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
     optimizer.minimize(avg_cost)
@@ -103,13 +187,12 @@ def main():
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
-    place = core.CPUPlace()
     exe = Executor(place)
 
     exe.run(framework.default_startup_program())
 
     batch_id = 0
-    for pass_id in xrange(2):
+    for pass_id in xrange(1):
         for data in train_data():
             word_data = to_lodtensor(map(lambda x: x[0], data), place)
             trg_word = to_lodtensor(map(lambda x: x[1], data), place)
@@ -125,9 +208,48 @@ def main():
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                   " avg_cost=" + str(avg_cost_val))
             if batch_id > 3:
-                exit(0)
+                break
             batch_id += 1
 
 
+def decode_main():
+    context = encoder()
+    translation_ids, translation_scores = decoder_decode(context)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [init_lod, init_lod]
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    for _, data in enumerate(train_data()):
+        init_ids = set_init_lod(init_ids_data, init_lod, place)
+        init_scores = set_init_lod(init_scores_data, init_lod, place)
+
+        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed={
+                'src_word_id': src_word_data,
+                'init_ids': init_ids,
+                'init_scores': init_scores
+            },
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.lod()
+        break
+
+
 if __name__ == '__main__':
-    main()
+    # train_main()
+    decode_main()