diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 62c154e65dcff1bdfb00109bf1b724c34731652e..290d4098798f837d966a7a9dd0dea2565d50b870 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -493,3 +493,8 @@ swish
 ------
 ..  autofunction:: paddle.v2.fluid.layers.swish
     :noindex:
+
+l2_normalize
+------------
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+    :noindex:
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index 573bb9c7dfdac2366c2458dd9f27a035a9f9b813..7adb74eab78dcdd0251b8db60781f6e24e348634 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -51,8 +51,8 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Clip Operator.
 
-The clip operator limits the value of given input within an interval. The interval is
-specified with arguments 'min' and 'max':
+The clip operator limits the value of given input within an interval. The
+interval is specified with arguments 'min' and 'max':
 
 $$
 Out = \min(\max(X, min), max)
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index a342595b546bfca1a344cf8a549597df6a29adec..1a0131d8b943da3deebd0c461f78cb02b34e6dc2 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -26,9 +26,9 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null");
+                   "Input(X) of elementwise op should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null");
+                   "Input(Y) of elementwise op should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of elementwise op should not be null.");
 
@@ -45,12 +45,12 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The first input tensor of elementwise op");
-    AddInput("Y", "(Tensor) The second input tensor of elementwise op");
-    AddOutput("Out", "The output of elementwise op");
+    AddInput("X", "(Tensor), The first input tensor of elementwise op.");
+    AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
+    AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
-                 "(int, default -1) The starting dimension index "
-                 "for broadcasting Y onto X")
+                 "(int, default -1). The start dimension index "
+                 "for broadcasting Y onto X.")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     comment_ = R"DOC(
@@ -58,19 +58,18 @@ Limited Elementwise {name} Operator.
 
 The equation is:
 
-.. math::
-  {equation}
+$${equation}$$
 
-X is a tensor of any dimension and the dimensions of tensor Y must be smaller than
-or equal to the dimensions of X. 
+$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
+smaller than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
-1. The shape of Y is same with X;
-2. The shape of Y is a subset of X.
+1. The shape of $Y$ is same with $X$;
+2. The shape of $Y$ is a subset of $X$.
 
 For case 2:
-Y will be broadcasted to match the shape of X and axis should be 
-the starting dimension index for broadcasting Y onto X.
+$Y$ will be broadcasted to match the shape of $X$ and axis should be
+set to index of the start dimension to broadcast $Y$ onto $X$.
 
 For example
   .. code-block:: python
@@ -81,7 +80,8 @@ For example
     shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
 
-Either of the inputs X and Y or none can carry the LoD (Level of Details) information. However, the output only shares the LoD information with input X.
+Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
+information. However, the output only shares the LoD information with input $X$.
 
 )DOC";
     AddComment(comment_);
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
index 08fa91ed72aa41ed2f513c090b9085410bb5cc47..043c93654d33f7c105c89960e18ec72d3557237d 100644
--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
@@ -58,21 +58,21 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
   ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
-             "X is the input tensor to be expanded.");
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
     AddOutput("Out",
-              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
-              "The rank of Output(Out) is same as Input(X) except that each "
-              "dimension size of Output(Out) is equal to corresponding "
-              "dimension size of Input(X) multiplying corresponding value of "
-              "Attr(expand_times).");
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
     AddAttr<std::vector<int>>("expand_times",
                               "Expand times number for each dimension.");
     AddComment(R"DOC(
 Expand operator tiles the input by given times number. You should set times
 number for each dimension by providing attribute 'expand_times'. The rank of X
-should be in [1, 6]. Please notice that size of 'expand_times' must be same with
-X's rank. Following is a using case:
+should be in [1, 6]. Please note that size of 'expand_times' must be the same
+with X's rank. Following is a using case:
 
 Input(X) is a 3-D tensor with shape [2, 3, 1]:
 
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 95797fba8f67bacb421f5c2813ad6332bc53cbc9..0eeaf7eabb179f19d2af8dafe821f7baa153fead 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -16,13 +16,22 @@ from paddle.trainer.config_parser import *
 from default_decorators import *
 
 __all__ = [
-    "evaluator_base", "classification_error_evaluator", "auc_evaluator",
-    "pnpair_evaluator", "precision_recall_evaluator", "ctc_error_evaluator",
-    "chunk_evaluator", "sum_evaluator", "column_sum_evaluator",
-    "value_printer_evaluator", "gradient_printer_evaluator",
-    "maxid_printer_evaluator", "maxframe_printer_evaluator",
-    "seqtext_printer_evaluator", "classification_error_printer_evaluator",
-    "detection_map_evaluator"
+    "evaluator_base",
+    "classification_error_evaluator",
+    "auc_evaluator",
+    "pnpair_evaluator",
+    "precision_recall_evaluator",
+    "ctc_error_evaluator",
+    "chunk_evaluator",
+    "sum_evaluator",
+    "column_sum_evaluator",
+    "value_printer_evaluator",
+    "gradient_printer_evaluator",
+    "maxid_printer_evaluator",
+    "maxframe_printer_evaluator",
+    "seqtext_printer_evaluator",
+    "classification_error_printer_evaluator",
+    "detection_map_evaluator",
 ]
 
 
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 8042febfed7ed7db2f6d1507142b17079aa00fd8..4f8366b64039b2edcd4c273439c87397bdc33595 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -116,8 +116,8 @@ def _debug_string_(proto, throw_on_error=True):
     """
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
-        raise ValueError("{0} are not initialized\nThe message is {1}".format(
-            error_fields, proto))
+        raise ValueError("{0} are not initialized.\nThe message is {1}:\n".
+                         format(error_fields, proto))
     return proto.__str__()
 
 
@@ -374,12 +374,13 @@ class Operator(object):
         >>>                     outputs={"Out": [var1]})
 
         Args:
-            block(Block): The block has the current operator
-            desc(core.OpDesc): The protobuf description
+            block(Block): The block has the current operator.
+            desc(core.OpDesc): The protobuf description.
             type(str): The type of operator.
             inputs(dict): The input dictionary. Key is the input parameter name.
                 Value is a list of variables.
-            outputs(dict): The output dictionary. Has same format with inputs
+            outputs(dict): The output dictionary, which has the same format as
+                           inputs.
             attrs(dict): The attributes dictionary. Key is attribute name. Value
                 is the attribute value. The attribute type should be as same as
                 the type registered in C++
@@ -436,10 +437,11 @@ class Operator(object):
             for m in proto.outputs:
                 need.add(m.name)
             if not given == need:
-                raise ValueError(
-                    "Incorrect setting for output(s) of operator \"%s\". Need: [%s] Given: [%s]"
-                    % (type, ", ".join(str(e) for e in need), ", ".join(
-                        str(e) for e in given)))
+                raise ValueError(("Incorrect setting for output(s) of "
+                                  "operator \"%s\". Need: [%s] Given: [%s]") %
+                                 (type, ", ".join(str(e)
+                                                  for e in need), ", ".join(
+                                                      str(e) for e in given)))
 
             for out_proto in proto.outputs:
                 out_args = outputs[out_proto.name]
@@ -818,9 +820,8 @@ class Program(object):
                 if isinstance(t, Variable):
                     t = t.op
                 else:
-                    raise ValueError(
-                        "All targets of prune() can only be Variable or Operator."
-                    )
+                    raise ValueError(("All targets of prune() can only be "
+                                      "Variable or Operator."))
 
             targets_idx.append([t.block.idx, t.idx])
         res = Program()
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
index 6177f0b4d71cdda8637fc40e3e65c72842bf7439..a43e0ee4def668bf7033f37cfa1a3f59d10a88d0 100644
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -28,9 +28,9 @@ def data(name,
     **Data Layer**
 
     This function takes in the input and based on whether data has
-    to be returned back as a minibatch, it creates the global variable using
+    to be returned back as a minibatch, it creates the global variable by using
     the helper functions. The global variables can be accessed by all the
-    following operations and layers in the graph.
+    following operators in the graph.
 
     All the input variables of this function are passed in as local variables
     to the LayerHelper constructor.
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4e8fd407c9983e2827d3137fa4f49a8425d5dce2..cfa60d2924a6af4fb8081e2b90853e262d95ff0e 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -50,6 +50,7 @@ __all__ = [
     'sequence_last_step',
     'dropout',
     'split',
+    'l2_normalize',
 ]
 
 
@@ -945,7 +946,8 @@ def pool2d(input,
            pool_type,
            pool_stride=None,
            pool_padding=None,
-           global_pooling=False):
+           global_pooling=False,
+           name=None):
     """
     This function adds the operator for pooling in 2 dimensions, using the
     pooling configurations mentioned in input parameters.
@@ -991,7 +993,8 @@ def batch_norm(input,
                epsilon=1e-05,
                param_attr=None,
                bias_attr=None,
-               data_layout='NCHW'):
+               data_layout='NCHW',
+               name=None):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1067,7 +1070,7 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
-def beam_search_decode(ids, scores):
+def beam_search_decode(ids, scores, name=None):
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
     sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -1091,7 +1094,8 @@ def conv2d_transpose(input,
                      padding=None,
                      stride=None,
                      dilation=None,
-                     param_attr=None):
+                     param_attr=None,
+                     name=None):
     """
     The transpose of conv2d layer.
 
@@ -1118,8 +1122,8 @@ def conv2d_transpose(input,
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation.
         param_attr: Parameter Attribute.
-        main_program(Program): the main program
-        startup_program(Program): the startup program
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: Output image.
@@ -1183,7 +1187,7 @@ def conv2d_transpose(input,
     return out
 
 
-def sequence_expand(x, y):
+def sequence_expand(x, y, name=None):
     """Sequence Expand Layer. This layer will expand the input variable **x**
     according to LoD information of **y**. And the following examples will
     explain how sequence_expand works:
@@ -1227,6 +1231,8 @@ def sequence_expand(x, y):
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
         y (Variable): The input variable which is a LoDTensor.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The expanded variable which is a LoDTensor.
@@ -1253,7 +1259,8 @@ def lstm_unit(x_t,
               cell_t_prev,
               forget_bias=0.0,
               param_attr=None,
-              bias_attr=None):
+              bias_attr=None,
+              name=None):
     """Lstm unit layer. The equation of a lstm step is:
 
         .. math::
@@ -1300,6 +1307,8 @@ def lstm_unit(x_t,
             initializer, name etc.
         bias_attr (ParamAttr): The attributes of bias weights, if not False,
             bias weights will be created and be set to default value.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         tuple: The hidden value and cell value of lstm unit.
@@ -1365,7 +1374,7 @@ def lstm_unit(x_t,
     return h, c
 
 
-def reduce_sum(input, dim=None, keep_dim=False):
+def reduce_sum(input, dim=None, keep_dim=False, name=None):
     """
     Computes the sum of tensor elements over the given dimension.
 
@@ -1379,6 +1388,8 @@ def reduce_sum(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1409,7 +1420,7 @@ def reduce_sum(input, dim=None, keep_dim=False):
     return out
 
 
-def reduce_mean(input, dim=None, keep_dim=False):
+def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
     Computes the mean of tensor elements over the given dimension.
 
@@ -1423,6 +1434,8 @@ def reduce_mean(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1453,7 +1466,7 @@ def reduce_mean(input, dim=None, keep_dim=False):
     return out
 
 
-def reduce_max(input, dim=None, keep_dim=False):
+def reduce_max(input, dim=None, keep_dim=False, name=None):
     """
     Computes the maximum of tensor elements over the given dimension.
 
@@ -1467,6 +1480,8 @@ def reduce_max(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1497,7 +1512,7 @@ def reduce_max(input, dim=None, keep_dim=False):
     return out
 
 
-def reduce_min(input, dim=None, keep_dim=False):
+def reduce_min(input, dim=None, keep_dim=False, name=None):
     """
     Computes the minimum of tensor elements over the given dimension.
 
@@ -1511,6 +1526,8 @@ def reduce_min(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1541,20 +1558,22 @@ def reduce_min(input, dim=None, keep_dim=False):
     return out
 
 
-def split(input, num_or_sections, dim=-1):
+def split(input, num_or_sections, dim=-1, name=None):
     """
-    Splits the tensor into multiple sub-tensors.
+    Split the input tensor into multiple sub-tensors.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        num_or_sections (int|list): If :attr:`num_or_sections` is an integer, 
-            then the integer indicates the number of equal sized sub-tensors 
-            that the tensor will be divided into. If :attr:`num_or_sections` 
-            is a list of integers, the length of list indicates the number of 
-            sub-tensors and the integers indicate the sizes of sub-tensors' 
+        num_or_sections (int|list): If :attr:`num_or_sections` is an integer,
+            then the integer indicates the number of equal sized sub-tensors
+            that the tensor will be divided into. If :attr:`num_or_sections`
+            is a list of integers, the length of list indicates the number of
+            sub-tensors and the integers indicate the sizes of sub-tensors'
             :attr:`dim` dimension orderly.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the 
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
             dimension to split along is :math:`rank(input) + dim`.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         List: The list of segmented tensor variables.
@@ -1597,3 +1616,94 @@ def split(input, num_or_sections, dim=-1):
             'axis': dim
         })
     return outs
+
+
+def l2_normalize(x, axis, epsilon=1e-12, name=None):
+    """
+    **L2 normalize Layer**
+
+    The l2 normalize layer normalizes `x` along dimension `axis` using an L2
+    norm. For a 1-D tensor (`axis` is fixed to 0), this layer computes
+
+    output = x / sqrt(sum(x**2))
+
+    For `x` with more dimensions, this layer independently normalizes each 1-D
+    slice along dimension `axis`.
+
+    Args:
+       x(Variable|list): The input tensor to l2_normalize layer.
+       axis(int): Dimension along which to normalize the input. It is forced
+                  to 0 for a 1-D `x`; if None, dimension 1 is used.
+       epsilon(float): Intended as a lower bound for the squared l2 norm of
+                       `x`. It is currently NOT applied (a maximum operator
+                       is required), so a zero norm is not guarded against.
+       name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The output tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data",
+                                   shape=(3, 17, 13),
+                                   dtype="float32")
+          normed = fluid.layers.l2_normalize(x=data, axis=1)
+    """
+
+    if len(x.shape) == 1:
+        axis = 0
+    elif axis is None:
+        # Fall back to dimension 1 when no axis is given.
+        axis = 1
+
+    helper = LayerHelper("l2_normalize", **locals())
+
+    square = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
+
+    reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reduce_sum",
+        inputs={"X": square},
+        outputs={"Out": reduced_sum},
+        attrs={
+            "dim": axis,
+            "keep_dim": True,
+            "reduce_all": False
+        })
+
+    # The l2 norm is the square root of the sum of squares; without this step
+    # the layer would divide by the squared norm instead of the norm.
+    norm = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="sqrt", inputs={"X": reduced_sum}, outputs={"Out": norm})
+
+    # TODO(caoying) A lower bound value epsilon for the norm is needed to
+    # improve the numeric stability of reciprocal. This requires a maximum_op.
+    rnorm = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reciprocal", inputs={"X": norm}, outputs={"Out": rnorm})
+
+    # TODO(caoying) the current elementwise_mul operator does not support a
+    # general broadcast rule which broadcasts Input(Y) to have the same
+    # dimension with Input(X) starting from a specified dimension. So this
+    # expansion is required. Once a general broadcast rule is supported, this
+    # expanding can be removed.
+    rnorm_expanded = helper.create_tmp_variable(dtype=x.dtype)
+    expand_times = [1] * len(x.shape)
+    expand_times[axis] = int(x.shape[axis])
+    helper.append_op(
+        type="expand",
+        inputs={"X": rnorm},
+        outputs={"Out": rnorm_expanded},
+        attrs={"expand_times": expand_times})
+
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="elementwise_mul",
+        inputs={"X": x,
+                "Y": rnorm_expanded},
+        outputs={"Out": out})
+    return out
diff --git a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..caff63011d038d785472cb38a26a51f3f4cc9288
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
@@ -0,0 +1,103 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import numpy as np
+
+
+class TestNormalization(unittest.TestCase):
+    data_desc = {"name": "input", "shape": (2, 3, 7)}
+
+    def gen_random_input(self):
+        """Generate random input data.
+        """
+        self.data = np.random.random(
+            size=self.data_desc["shape"]).astype("float32")
+
+    def set_program(self, axis, epsilon):
+        """Build the test program.
+        """
+        data = fluid.layers.data(
+            name=self.data_desc["name"],
+            shape=self.data_desc["shape"],
+            dtype="float32",
+            append_batch_size=False)
+        data.stop_gradient = False
+        l2_norm = fluid.layers.l2_normalize(x=data, axis=axis, epsilon=epsilon)
+        out = fluid.layers.reduce_sum(l2_norm, dim=None)
+
+        fluid.backward.append_backward(loss=out)
+        self.fetch_list = [l2_norm]
+
+    def run_program(self):
+        """Run the test program on every available place.
+
+        The fetched results of all places are kept in self.op_outputs so that
+        each of them can be checked, not only the last one.
+        """
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+
+        self.op_outputs = []
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=True)
+            self.op_output = output
+            self.op_outputs.append(output)
+
+    def set_inputs(self, place):
+        """Set the randomly generated data to the test program.
+        """
+        self.inputs = {}
+        tensor = fluid.Tensor()
+        tensor.set(self.data, place)
+        self.inputs[self.data_desc["name"]] = tensor
+
+    def l2_normalize(self, data, axis, epsilon):
+        """Compute the groundtruth: data / sqrt(sum(data**2)) along `axis`.
+
+        `epsilon` is accepted but unused because the layer under test does
+        not apply it yet.
+        """
+        norm = np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True))
+        return data / norm
+
+    def test_l2_normalize(self):
+        """Test the python wrapper for l2_normalize.
+        """
+        axis = 1
+        #TODO(caoying) epsilon is not supported due to lack of a maximum_op.
+        epsilon = 1e-6
+
+        self.gen_random_input()
+
+        self.set_program(axis, epsilon)
+        self.run_program()
+
+        expect_output = self.l2_normalize(self.data, axis, epsilon)
+
+        # check the output fetched from every place
+        for op_output in self.op_outputs:
+            self.assertTrue(np.allclose(op_output, expect_output, atol=0.001))
+
+
+if __name__ == '__main__':
+    unittest.main()