diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 0ecbacb7bbc302449faed86665d6eb38e65f48e7..120c9d11a5ebaa72b94590e596fd4362c552f979 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1400,13 +1400,13 @@ def simple_attention(encoded_sequence,
 
 @wrap_name_default()
 def dot_product_attention(encoded_sequence,
-                          attending_sequence,
+                          attended_sequence,
                           transformed_state,
                           softmax_param_attr=None,
                           name=None):
     """
     Calculate and return a context vector with dot-product attention mechanism.
-    Size of the context vector equals to size of the attending_sequence.
+    The dimension of the context vector equals that of the attended_sequence.
 
     .. math::
 
@@ -1419,35 +1419,38 @@ def dot_product_attention(encoded_sequence,
         c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
 
     where :math:`h_{j}` is the jth element of encoded_sequence,
-    :math:`z_{j}` is the jth element of attending_sequence,
-    :math:`s_{i-1}` is transformed_state
+    :math:`z_{j}` is the jth element of attended_sequence,
+    :math:`s_{i-1}` is transformed_state.
 
     The example usage is:
 
     .. code-block:: python
 
         context = dot_product_attention(encoded_sequence=enc_seq,
-                                        attending_sequence=att_seq,
+                                        attended_sequence=att_seq,
                                         transformed_state=state,)
 
-    :param name: name of the dot-product attention model.
+    :param name: A prefix attached to the name of each layer defined inside
+                 dot_product_attention.
     :type name: basestring
-    :param softmax_param_attr: parameter attribute of sequence softmax
+    :param softmax_param_attr: The parameter attribute of the sequence softmax
                                that is used to produce attention weight.
     :type softmax_param_attr: ParameterAttribute
-    :param encoded_sequence: output of the encoder
+    :param encoded_sequence: The output hidden vectors of the encoder.
     :type encoded_sequence: LayerOutput
-    :param attending_sequence: attention weight is computed by a feed forward neural
-                               network which has two inputs : decoder's transformed
-                               hidden state of previous time step and encoder's output.
-                               attending_sequence is the sequence to be attended.
-    :type attending_sequence: LayerOutput
-    :param transformed_state: transformed hidden state of decoder in previous time step,
-                              its size should equal to encoded_sequence's. Here we do the
-                              transformation outside dot_product_attention for flexibility
-                              consideration.
+    :param attended_sequence: The attention weight is computed by a feed forward
+                              neural network with two inputs: the decoder's transformed
+                              hidden state of the previous time step and the encoder's
+                              output. attended_sequence is the sequence to be attended.
+    :type attended_sequence: LayerOutput
+    :param transformed_state: The transformed hidden state of the decoder in the
+                              previous time step. Since a dot product is taken between
+                              it and the encoded_sequence, their dimensions must be
+                              equal. For flexibility, the transformation of the decoder's
+                              hidden state is assumed to be done outside this function,
+                              so either the original or a transformed state may be passed.
     :type transformed_state: LayerOutput
-    :return: a context vector
+    :return: The context vector.
     :rtype: LayerOutput
     """
     assert transformed_state.size == encoded_sequence.size
@@ -1470,7 +1473,7 @@ def dot_product_attention(encoded_sequence,
 
     scaled = scaling_layer(
         weight=attention_weight,
-        input=attending_sequence,
+        input=attended_sequence,
         name='%s_scaling' % name)
 
     return pooling_layer(
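
For reviewers who want to try the renamed interface, below is a minimal usage sketch (not part of this patch) of how dot_product_attention might be wired into a decoder step using the v1 trainer_config_helpers API. The decoder_step function, layer names, sizes, and the fc_layer projection of the decoder state are illustrative assumptions, not code from this repository.

    from paddle.trainer_config_helpers import *

    decoder_size = 512  # hypothetical decoder hidden size

    def decoder_step(enc_seq, att_seq, current_word):
        # Hidden state of the decoder from the previous time step.
        decoder_mem = memory(name='gru_decoder', size=decoder_size)

        # Per the new docstring, the decoder state is transformed outside
        # dot_product_attention; a linear projection (an assumed choice here)
        # matches its dimension to that of encoded_sequence.
        transformed = fc_layer(input=decoder_mem,
                               size=enc_seq.size,
                               act=LinearActivation(),
                               bias_attr=False)

        # The returned context vector has the same dimension as att_seq.
        context = dot_product_attention(encoded_sequence=enc_seq,
                                        attended_sequence=att_seq,
                                        transformed_state=transformed)

        # Combine the context with the current word, as in the seqToseq demos.
        decoder_input = mixed_layer(
            input=[
                full_matrix_projection(input=context),
                full_matrix_projection(input=current_word)
            ],
            size=decoder_size * 3)

        return gru_step_layer(name='gru_decoder',
                              input=decoder_input,
                              output_mem=decoder_mem,
                              size=decoder_size)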