diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index fc2be70d10ef3cbb4af8533513a24439a70e981f..3181a04e9dae385863218806f892574e514e720e 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1968,7 +1968,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
             data = fluid.layers.data(name="data",
                                      shape=(3, 17, 13),
                                      dtype="float32")
-            fc = fluid.layers.l2_normalize(x=data, axis=1)
+            normed = fluid.layers.l2_normalize(x=data, axis=1)
     """
     if len(x.shape) == 1:
         axis = 0
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index ac7e7c937716429d01eabd998f2db961dca7c5e3..bfc034a1113f1dea21be05064777e356852be47d 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -182,28 +182,28 @@ def scaled_dot_product_attention(queries,
     Refer to `Attention Is All You Need
     <https://arxiv.org/abs/1706.03762>`_.
 
-    Note that batch data containing sequences with different lengths is not
-    supported by this because of the (batch) matrix multipication.
-
+
     Args:
-        queries (Variable): The input variable which is a Tensor or
-            LoDTensor.
-        keys (Variable): The input variable which is a Tensor or LoDTensor.
-        values (Variable): The input variable which is a Tensor or
-            LoDTensor.
-        num_heads (int): Head number to compute the dot product attention.
-        dropout_rate (float): The dropout rate for attention weight.
+        queries (Variable): The input variable which should be a 3-D Tensor.
+        keys (Variable): The input variable which should be a 3-D Tensor.
+        values (Variable): The input variable which should be a 3-D Tensor.
+        num_heads (int): Head number to compute the scaled dot product
+            attention. Default value is 1.
+        dropout_rate (float): The dropout rate to drop the attention weight.
+            Default value is 0.
 
     Returns:
-        Variable: The context Tensor computed by multi-head scaled dot product
+
+        Variable: A 3-D Tensor computed by multi-head scaled dot product
             attention.
 
     Examples:
         .. code-block:: python
 
-            # Suppose q, k, v are tensor variables with the following
-            # shape: q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
-            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
+            # Suppose q, k, v are Tensors with the following shape:
+            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
+
+            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
             out.shape  # [3, 5, 10]
             attn_scores.shape  # [3, 5, 6]
     """
@@ -227,19 +227,30 @@ def scaled_dot_product_attention(queries,
                          "by the number of attention heads (%d)." %
                          (values.shape[-1], num_heads))
 
+    def __compute_qkv(queries, keys, values, num_heads):
+        if num_heads == 1:
+            return queries, keys, values
+
+        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
+        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
+        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
+        return q, k, v
+
     def __split_heads(x, num_heads):
         """
         Reshape the last dimension of inpunt tensor x so that it becomes two
         dimensions.
 
         Args:
-          x(Tensor): a 3-D input Tensor.
-          num_heads(int): The number of heads.
+            x(Tensor): a 3-D input Tensor.
+            num_heads(int): The number of heads.
 
         Returns:
-          a Tensor with shape [..., n, m/n]
+            Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
+                of the last dimension of x.
         """
-        if num_heads == 1: return x
+        if num_heads == 1:
+            return x
 
         hidden_size = x.shape[-1]
         # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
@@ -254,6 +265,19 @@ def scaled_dot_product_attention(queries,
         return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
 
     def __combine_heads(x):
+        """
+        Reshape the last two dimensions of input tensor x so that it becomes
+        one dimension.
+
+        Args:
+            x(Tensor): a 4-D input Tensor with shape
+                [bs, num_heads, max_sequence_length, hidden_dim].
+
+        Returns:
+            Tensor: a Tensor with shape
+                [bs, max_sequence_length, num_heads * hidden_dim].
+        """
+
         if len(x.shape) == 3: return
         if len(x.shape) != 4:
             raise ValueError("Input(x) should be a 4-D Tensor.")
@@ -266,9 +290,11 @@ def scaled_dot_product_attention(queries,
                 trans_x.shape[2] * trans_x.shape[3]
             ]))
 
-    q = __split_heads(queries, num_heads)
-    k = __split_heads(keys, num_heads)
-    v = __split_heads(values, num_heads)
+    q, k, v = __compute_qkv(queries, keys, values, num_heads)
+
+    q = __split_heads(q, num_heads)
+    k = __split_heads(k, num_heads)
+    v = __split_heads(v, num_heads)
 
     key_dim_per_head = keys.shape[-1] // num_heads
     scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)