diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst
index 6e813ab1a820d068ea3e54cad6178f1cf928eadc..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644
--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,3 +125,8 @@ simple_attention
     :members: simple_attention
     :noindex:
 
+dot_product_attention
+---------------------
+..  automodule:: paddle.v2.networks
+    :members: dot_product_attention
+    :noindex:
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 93e8ac173e721d9623fce91f30ac4642d273caba..0ecbacb7bbc302449faed86665d6eb38e65f48e7 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -26,8 +26,9 @@ __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
     "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
     'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
-    'bidirectional_lstm', 'inputs', 'outputs'
+    'simple_attention', 'dot_product_attention', 'simple_gru2',
+    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
+    'outputs'
 ]
 
 ######################################################
@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
                                 compute attention weight.
     :type transform_param_attr: ParameterAttribute
     :return: a context vector
+    :rtype: LayerOutput
     """
     assert encoded_proj.size == decoder_state.size
     proj_size = encoded_proj.size
@@ -1396,6 +1398,85 @@ def simple_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
+@wrap_name_default()
+def dot_product_attention(encoded_sequence,
+                          attending_sequence,
+                          transformed_state,
+                          softmax_param_attr=None,
+                          name=None):
+    """
+    Calculate and return a context vector with dot-product attention mechanism.
+    Size of the context vector equals to size of the attending_sequence.
+
+    ..  math::
+
+        a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
+
+        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
+
+    where :math:`h_{j}` is the jth element of encoded_sequence,
+    :math:`z_{j}` is the jth element of attending_sequence,
+    :math:`s_{i-1}` is transformed_state
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = dot_product_attention(encoded_sequence=enc_seq,
+                                        attending_sequence=att_seq,
+                                        transformed_state=state,)
+
+    :param name: name of the dot-product attention model.
+    :type name: basestring
+    :param softmax_param_attr: parameter attribute of sequence softmax
+                               that is used to produce attention weight.
+    :type softmax_param_attr: ParameterAttribute
+    :param encoded_sequence: output of the encoder
+    :type encoded_sequence: LayerOutput
+    :param attending_sequence: attention weight is computed by a feed forward neural
+                               network which has two inputs : decoder's transformed
+                               hidden state of previous time step and encoder's output.
+                               attending_sequence is the sequence to be attended.
+    :type attending_sequence: LayerOutput
+    :param transformed_state: transformed hidden state of decoder in previous time step,
+                              its size should equal to encoded_sequence's. Here we do the
+                              transformation outside dot_product_attention for flexibility
+                              consideration.
+    :type transformed_state: LayerOutput
+    :return: a context vector
+    :rtype: LayerOutput
+    """
+    assert transformed_state.size == encoded_sequence.size
+
+    expanded = expand_layer(
+        input=transformed_state,
+        expanded_as=encoded_sequence,
+        name='%s_expand' % name)
+
+    m = linear_comb_layer(
+        weights=expanded, vectors=encoded_sequence, name='%s_dot-product')
+
+    attention_weight = fc_layer(
+        input=m,
+        size=1,
+        act=SequenceSoftmaxActivation(),
+        param_attr=softmax_param_attr,
+        name="%s_softmax" % name,
+        bias_attr=False)
+
+    scaled = scaling_layer(
+        weight=attention_weight,
+        input=attending_sequence,
+        name='%s_scaling' % name)
+
+    return pooling_layer(
+        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
+
+
 def inputs(layers, *args):
     """
     Declare the inputs of network. The order of input should be as same as