diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
index 09258fb30599014c6746ad20f0caeed9ff1692f5..f7736f0ce905f88d69598e2cef8e825fbec7de70 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -161,7 +161,17 @@ real CostForOneSequence::forward() {
 }
 
 void CostForOneSequence::backward() {
+  /*
+   * When the softmax layer is the output layer and is combined with
+   * cross-entropy as the cost, the derivative with regard to softmax's
+   * input is simply:
+   *
+   *   grad_i = softmax_out_i - target_i,
+   *
+   * and here a hard label is used.
+   */
   softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
+
   MatrixPtr tmp = Matrix::create(
       softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
 
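Note: the hard-label gradient quoted in the comment above is the standard softmax-with-cross-entropy result. With pre-softmax input :math:`z` and gold index :math:`t`:

.. math::

   \frac{\partial}{\partial z_i}\left(-\log\frac{e^{z_t}}{\sum_j e^{z_j}}\right)
   = \frac{e^{z_i}}{\sum_j e^{z_j}} - \mathbb{1}[i = t]
   = \mathrm{softmax}(z)_i - \mathrm{target}_i

which is why `backward()` only subtracts 1 at `goldIdsInFinalExpansion_` and leaves the remaining softmax outputs as the gradient.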
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
index 96a5df7dfbe46108ef04d23857d155305ad40b56..5d0cffee3c159702a2d6b96de710553b0ede9f6a 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -19,8 +19,8 @@ limitations under the License. */
 
 namespace paddle {
 
+/* This struct stores the beams in all search steps for a single sequence. */
 struct BeamExpansion {
-  // store the entire beam expansion for a single sequence
   std::vector<MatrixPtr> scores;
   std::vector<IVectorPtr> seqInfo;
 
@@ -111,8 +111,11 @@ private:
   size_t batchSize_;
   size_t beamSize_;
 
-  // Currently, this layer only works on CPU, if its inputs is on GPU,
-  // copy them to CPU memory.
+  /*
+   * The process of constructing beams is not GPU-friendly, so currently
+   * this layer only runs on CPU; if any of its inputs is in GPU memory,
+   * it is first copied to CPU memory.
+   */
   std::vector<MatrixPtr> candidateScores_;
   std::vector<MatrixPtr> candidateScoreGrad_;
   std::vector<MatrixPtr> candidateInBeam_;
@@ -120,9 +123,12 @@
   std::vector<IVectorPtr> goldSequence_;
   std::vector<std::vector<int>> beamSplitPos_;
 
-  // split entire bath of beams into beam per sequnence.
+  /*
+   * Split the entire batch of beams into beams per sequence and store the
+   * result in this member.
+   */
   std::vector<BeamExpansion> beamPerSeq_;
-  // beamCosts_ is used to propagate error in one sequence.
+  /* beamCosts_ is used to propagate error in one sequence. */
   std::vector<CostForOneSequence> beamCosts_;
 };
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
index 506a4281df4f0a2cead2c83c4754cfb4226b8b80..538d18cdc3d262df0ddb031d9e6b38a3fea57606 100644
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -28,16 +28,10 @@ using namespace paddle;  // NOLINT
 DECLARE_int32(gpu_id);
 DECLARE_bool(thread_local_rand_use_global_seed);
 
-// const size_t MAX_SEQ_NUM = 5;
-// const size_t MAX_SEQ_LEN = 10;
-// const size_t MAX_BEAM_SIZE = 3;
-
 const size_t MAX_SEQ_NUM = 23;
 const size_t MAX_SEQ_LEN = 50;
 const size_t MAX_BEAM_SIZE = 27;
 
-// const size_t SEED = 1503391792;
-// const size_t SEED = 1;
 const size_t SEED = (size_t)(time(NULL));
 
 struct SingleBeamExpansion {
@@ -176,10 +170,12 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
   beam.resetGroundTruth(seqNum);
   for (size_t i = 0; i < seqNum; ++i) {
     if (randFloat() > 0.5) {
-      // force the randomly generated label falls in the beam by chance 0.5.
-      // otherwise, when sequence length is relatively long and beam size is
-      // relatively small, the gold sequences falls off the beam at in
-      // the first search.
+      /*
+       * Force the randomly generated label to fall in the beam with a
+       * chance of 0.5. Otherwise, when the sequence length is relatively
+       * long and the beam size is relatively small, the gold sequence
+       * falls off the beam in the first search.
+       */
       real* begPos = beam.selectedIndices.data() + i * beamSize;
       beam.colIdxInBeam[i] =
           rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
@@ -222,9 +218,7 @@
 
     if (randFloat() > 0.5) {
       // force the randomly generated label falls in the beam by chance 0.5.
-      // otherwise, when sequence length is relatively long and beam size is
-      // relatively small, the gold sequences falls off the beam at in
-      // the first search.
+
       real* start =
           curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
       int n = rand() % count_if(start, start + beamSize, [](const real& val) {
@@ -339,7 +333,7 @@ TEST(Layer, CrossEntropyOverBeam) {
   const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
   LOG(INFO) << "beamSize = " << beamSize;
 
-  // TODO(caoying): test with more beam expansions.
+  // TODO(caoying): test with random beam expansions.
   const size_t expansionCount = 3;
   vector<SingleBeamExpansion> beams;
   genRandomBeamExpansion(expansionCount, beamSize, beams);
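The 0.5-probability trick in `genGroundTruth` above is what exercises both the gold-in-beam and gold-off-beam code paths. A minimal Python sketch of the same labeling strategy, with a hypothetical helper that is not part of the test:

.. code-block:: python

    import random

    def pick_gold(selected_candidates, num_candidates):
        """With probability 0.5, draw the gold from the beam itself so it
        is guaranteed to stay in the beam; otherwise draw it uniformly, so
        it may fall off the beam when num_candidates is much larger than
        the beam size."""
        if random.random() > 0.5:
            return random.choice(selected_candidates)
        return random.randrange(num_candidates)

    beam = [3, 7, 11]  # candidate indices selected by a beam of size 3
    print(pick_gold(beam, num_candidates=50))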
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 7707ece819c9e684e13730e21c8d8c64649e2710..579713546f15e25f5f67979038211aea041b7a92 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1605,16 +1605,16 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 @config_layer('cross_entropy_over_beam')
 class CrossEntropyOverBeamLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
-        config_assert(len(inputs) % 3 == 0, "Error input numbers.")
+        config_assert(len(inputs) % 3 == 0, "Wrong input number.")
         super(CrossEntropyOverBeamLayer, self).__init__(
             name, 'cross_entropy_over_beam', 0, inputs, **xargs)
         input_num = len(inputs) / 3
         for i in range(input_num):
-            input_layer = self.get_input_layer(i * 2)
-            config_assert(
-                input_layer.size == 1, "Inputs for this layer are made up of "
-                "several pairs and the first one in a pair is scores for "
-                "all the candidates, so its size should be equal to 1.")
+            input_layer = self.get_input_layer(i * 3)
+            config_assert(input_layer.size == 1, (
+                "Inputs for this layer are made up of several triples, in "
+                "which the first one is the scores over all candidate "
+                "paths, whose size should be equal to 1."))
 
 
 @config_layer('fc')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b027f84b5d576103b6e03ef6709a6c1f335aabe2..053c92d005f7a7929e7cef35ce0cf7ad62efa760 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -103,6 +103,7 @@ __all__ = [
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'BeamInput',
     'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
@@ -5681,10 +5682,10 @@ def multi_binary_label_cross_entropy(input,
 
     if input.activation is None or \
             not isinstance(input.activation, SigmoidActivation):
-        logger.log(
-            logging.WARN,
-            "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-            "maybe the sigmoid is better" % repr(input.activation))
+        logger.log(logging.WARN,
+                   ("%s is not a recommended activation for "
+                    "multi_binary_label_cross_entropy; sigmoid is better.") %
+                   repr(input.activation))
 
     Layer(
         name=name,
@@ -5699,26 +5700,110 @@ def multi_binary_label_cross_entropy(input,
         size=1)
 
 
+class BeamInput(object):
+    """
+    Define one input for the cross_entropy_over_beam layer.
+
+    A beam is made up of a triple: the first one is the scores over all
+    candidates; the second one is the indices of the top k selected
+    candidates; the third one is the index of the ground truth, which is
+    also called the gold.
+    """
+
+    def __init__(self, candidate_scores, selected_candidates, gold):
+        assert isinstance(candidate_scores, LayerOutput)
+        self.candidate_scores = candidate_scores
+        assert candidate_scores.size == 1
+
+        assert isinstance(selected_candidates, LayerOutput)
+        self.selected_candidates = selected_candidates
+
+        assert isinstance(gold, LayerOutput)
+        self.gold = gold
+
+
 @wrap_name_default()
 @layer_support()
-def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None):
-    """
-    TODO(caoying) add comments.
+def cross_entropy_over_beam(input, name=None):
     """
+    This layer is used in learning-to-search models, which solve complex
+    joint prediction problems based on learning to search through a
+    problem-defined search space.
 
-    assert len(input) / 2 == len(label), "Error input numbers."
-    for i in range(0, len(input), 2):
-        assert (input[i].size == 1), (
-            "Inputs for this layer are made up of "
-            "several pairs and the first one in a pair is scores for "
-            "all the candidates, so its size should be equal to 1.")
+    Specifically, the learning-to-search process for this layer begins with
+    searching a target sequence from a nested sequence. In the first search
+    step, the top beam-size sequences with the highest scores, the indices
+    of these top k sequences in the original nested sequence, and the
+    ground truth (also called gold) altogether form the first beam.
 
-    ipts, parents = __cost_input__(input, label, weight)
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY_OVER_BEAM,
-        inputs=ipts,
-        coeff=coeff)
+    Then, several special positions are searched, for example, the start
+    and end positions that define meaningful segments. In these searches,
+    the top k positions with the highest scores are selected, and the
+    sub-sequences running from the selected start positions to the ends of
+    the sequences (or to a fixed position) are taken for the next search.
+
+    We call the possible top k results returned in one search a beam. This
+    search process can be repeated for a pre-defined number of turns, which
+    leads to several beam expansions.
+
+    Finally, the layer cross_entropy_over_beam takes all the beam
+    expansions, which contain the candidate targets found along the
+    multi-step search, and calculates the cross entropy over each expanded
+    beam, with all the candidates in the beam as the normalization factor.
+
+    Note that if the gold falls off the beam at search step t, then the
+    cost is calculated over the beam at step t.
+
+    This cost layer always works together with kmax_sequence_score_layer,
+    sub_nested_seq_layer, and sequence_slice_layer to trim the input to
+    form a sub-search space.
+
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = cross_entropy_over_beam(input=[
+           BeamInput(
+               candidate_scores=beam1_candidates,
+               selected_candidates=beam1_topk,
+               gold=gold1),
+           BeamInput(
+               candidate_scores=beam2_candidates,
+               selected_candidates=beam2_topk,
+               gold=gold2),
+       ])
+
+
+    :param input: input beams for this layer.
+    :type input: BeamInput or a list of BeamInput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    if isinstance(input, BeamInput):
+        input = [input]
+    else:
+        assert isinstance(input, list), (
+            'input for cross_entropy_over_beam should be a python list '
+            'of BeamInput objects.')
+        for ipt in input:
+            assert isinstance(ipt, BeamInput), (
+                'input for cross_entropy_over_beam '
+                'should be a BeamInput object.')
+
+    ipts = []
+    parents = []
+    for beam in input:
+        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
+        ipts += [
+            beam.candidate_scores.name, beam.selected_candidates.name,
+            beam.gold.name
+        ]
+
+    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
     return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
 
 
@@ -6247,11 +6332,11 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
 @wrap_bias_attr_default()
 def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     """
-    A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scale it and then
+    A layer applies a linear transformation to each element in each row of
+    the input matrix. For each element, the layer first re-scales it and then
     adds a bias to it.
 
-    This layer is very like the SlopeInterceptLayer, except the scale and
+    This layer is very similar to SlopeInterceptLayer, except the scale and
     bias are trainable.
 
     .. math::
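To make the cross_entropy_over_beam docstring's "all the candidates in the beam as the normalization factor" concrete, here is a rough NumPy reference of the per-sequence cost. This is a sketch of the semantics described above, not the PaddlePaddle implementation:

.. code-block:: python

    import numpy as np

    def cross_entropy_over_one_beam(candidate_scores, gold_idx):
        """Cross entropy over one beam expansion: every candidate in the
        beam contributes to the softmax normalization."""
        scores = np.asarray(candidate_scores, dtype=np.float64)
        probs = np.exp(scores - scores.max())
        probs /= probs.sum()
        return -np.log(probs[gold_idx])

    # Three search steps for one sequence; the gold stays in the beam here,
    # so the cost is evaluated on the final expansion. If the gold fell off
    # the beam at step t, the same computation would use the beam at step t.
    expansions = [np.random.rand(4) for _ in range(3)]
    print(cross_entropy_over_one_beam(expansions[-1], gold_idx=2))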
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
index e44478ec2ba1fbbcc935f418540441f99fda6d4e..c43fc48e222044b65d83b6162e7dc3954e119887 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
@@ -114,27 +114,26 @@ layers {
     input_layer_name: "__kmax_sequence_score_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_0__"
+    input_layer_name: "sentences_ids"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_1__"
+    input_layer_name: "__fc_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_1__"
+    input_layer_name: "__kmax_sequence_score_layer_1__"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_2__"
+    input_layer_name: "start_ids"
   }
   inputs {
-    input_layer_name: "sentences_ids"
+    input_layer_name: "__fc_layer_1__"
   }
   inputs {
-    input_layer_name: "start_ids"
+    input_layer_name: "__kmax_sequence_score_layer_2__"
   }
   inputs {
     input_layer_name: "end_ids"
   }
-  coeff: 1.0
 }
 parameters {
   name: "___fc_layer_0__.w0"
@@ -177,8 +176,8 @@ parameters {
   initial_smart: false
 }
 input_layer_names: "sentence_scores"
-input_layer_names: "sentence_states"
 input_layer_names: "sentences_ids"
+input_layer_names: "sentence_states"
 input_layer_names: "start_ids"
 input_layer_names: "end_ids"
 output_layer_names: "__cross_entropy_over_beam_0__"
@@ -198,8 +197,8 @@ sub_models {
   layer_names: "end_ids"
   layer_names: "__cross_entropy_over_beam_0__"
   input_layer_names: "sentence_scores"
-  input_layer_names: "sentence_states"
   input_layer_names: "sentences_ids"
+  input_layer_names: "sentence_states"
   input_layer_names: "start_ids"
   input_layer_names: "end_ids"
   output_layer_names: "__cross_entropy_over_beam_0__"
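The reordered `inputs` in the protostr above follow the flattened-triple layout that the `i * 2` to `i * 3` fix in config_parser.py also relies on: each beam contributes (candidate scores, selected candidates, gold) back to back. A small self-contained sketch of that layout, with illustrative names only:

.. code-block:: python

    # Packed input layout assumed by CrossEntropyOverBeamLayer:
    #   [scores_0, candidates_0, gold_0, scores_1, candidates_1, gold_1, ...]
    inputs = [
        "scores_0", "candidates_0", "gold_0",
        "scores_1", "candidates_1", "gold_1",
    ]

    assert len(inputs) % 3 == 0
    for i in range(len(inputs) // 3):
        scores = inputs[i * 3]  # the old i * 2 indexing picked wrong layers
        candidates = inputs[i * 3 + 1]
        gold = inputs[i * 3 + 2]
        assert scores == "scores_%d" % i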
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
index edc2d32fca1c911ad72277b5175578565443b783..240e703dc904e718c2c1ddaf2b6d7dccb4dabf41 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
@@ -29,11 +29,17 @@ topk_end_pos_ids = kmax_sequence_score_layer(
 sentence_idx = data_layer(name="sentences_ids", size=1)
 start_idx = data_layer(name="start_ids", size=1)
 end_idx = data_layer(name="end_ids", size=1)
-cost = cross_entropy_over_beam(
-    input=[
-        sentence_scores, topk_sentence_ids, start_pos_scores,
-        topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
-    ],
-    label=[sentence_idx, start_idx, end_idx])
+cost = cross_entropy_over_beam(input=[
+    BeamInput(
+        candidate_scores=sentence_scores,
+        selected_candidates=topk_sentence_ids,
+        gold=sentence_idx), BeamInput(
+            candidate_scores=start_pos_scores,
+            selected_candidates=topk_start_pos_ids,
+            gold=start_idx), BeamInput(
+                candidate_scores=end_pos_scores,
+                selected_candidates=topk_end_pos_ids,
+                gold=end_idx)
+])
 
 outputs(cost)
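The config above relies on kmax_sequence_score_layer to produce the `selected_candidates` of each BeamInput. For intuition only, a tiny NumPy sketch of what a k-max selection over candidate scores yields, namely the indices of the top `beam_size` candidates; this is an illustration, not the layer's actual sequence-aware implementation:

.. code-block:: python

    import numpy as np

    def kmax_indices(scores, beam_size):
        """Indices of the beam_size highest-scoring candidates, best first."""
        return np.argsort(-np.asarray(scores))[:beam_size]

    scores = [0.1, 0.9, 0.4, 0.7]
    print(kmax_indices(scores, beam_size=2))  # -> [1 3]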