diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
index d27d1d8138cd1aa5f6da3b7a1df04e5b1e706653..ac94dfb92ae191440a416eac466a7d5716b69c13 100644
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -21,29 +21,19 @@ from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.framework import grad_var_name
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
 def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
-    old_shape = x.shape
-    N = reduce(mul, old_shape[0:begin_norm_axis], 1)
-    D = reduce(mul, old_shape[begin_norm_axis:len(old_shape)], 1)
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
     x.shape = [N, D]
+
     mean = np.mean(x, axis=1)
     var = np.var(x, axis=1) + epsilon
     output = scale.reshape([1, D]) * np.divide(
         (x - mean.reshape([N, 1])),
         (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
-    output.shape = old_shape
-    x.shape = old_shape
+
+    x.shape, output.shape = x_shape, x_shape
     return output, mean, var
 
 
@@ -52,27 +42,25 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     scale_shape = scale.shape
     N = reduce(mul, x_shape[0:begin_norm_axis], 1)
     D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
-    grad_y.shape = [N, D]
-    x.shape = [N, D]
-    mean.shape = [N, 1]
-    var.shape = [N, 1]
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
     scale.shape = [1, D]
+    # d_bias
     d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    # d_scale
     d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
                      axis=0).reshape([1, D])
-
+    # dx
     dx_end = scale * np.sqrt(1.0 / var) * grad_y
-
     d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
         [N, 1])
     # d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape(
     #     [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) *
     #     np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1])
     d_mean = 1.0 / D * d_mean_0
-
     d_std = np.sum(
-        -1.0 / var * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
+        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
             1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
 
     grad_x = dx_end + d_mean + d_std
 
@@ -83,6 +71,17 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     return grad_x, d_scale, d_bias
 
 
+def get_backward_op(scope, op, no_grad_set):
+    backward_op = core.Operator.backward(op, no_grad_set)
+    for input in backward_op.input_vars():
+        var = scope.var(input)
+        var.get_tensor()
+    for output in backward_op.output_vars():
+        var = scope.var(output)
+        var.get_tensor()
+    return backward_op
+
+
 def create_or_get_tensor(scope, var_name, var, place):
     tensor = scope.var(var_name).get_tensor()
     if var is not None:
@@ -145,8 +144,9 @@ class TestLayerNormdOp(OpTest):
 
         self.assertLessEqual(max_diff, max_relative_error, err_msg())
 
-    def test_forward_backward(self):
+    def check_forward_backward(self, shape, begin_norm_axis):
         def test_with_place(place, shape, begin_norm_axis=1):
+            # setUp
             assert begin_norm_axis > 0 and begin_norm_axis < len(
                 shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
             # attr
@@ -158,30 +158,35 @@ class TestLayerNormdOp(OpTest):
             x_val = np.random.random_sample(x_shape).astype(np.float32)
             scale_val = np.random.random_sample(scale_shape).astype(np.float32)
             bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            y_grad = np.random.random_sample(x_shape).astype(np.float32)
 
             # run forward
             y_out, saved_mean, var_ref = _reference_layer_norm_naive(
                 x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
 
-            # for gradient test
-            y_grad = np.random.random_sample(x_shape).astype(np.float32)
-
+            # get gradient
             x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
                 x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            naive_grad = {
+                "X": x_grad_ref,
+                "Scale": scale_grad_ref,
+                "Bias": bias_grad_ref
+            }
 
             scope = core.Scope()
 
             # create input
-            x_tensor = create_or_get_tensor(scope, "X", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "Scale", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "Bias", bias_val, place)
+            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+            for i_name in input_map:
+                create_or_get_tensor(scope, i_name, input_map[i_name], place)
 
             # create output
-            y_tensor = create_or_get_tensor(scope, "Y", None, place)
-            mean_tensor = create_or_get_tensor(scope, "Mean", None, place)
-            variance_tensor = create_or_get_tensor(scope, "Variance", None,
-                                                   place)
+            output_map = {"Y": None, "Mean": None, "Variance": None}
+            output_tensor = {}
+            for o_name in output_map:
+                output_tensor[o_name] = create_or_get_tensor(
+                    scope, o_name, output_map[o_name], place)
 
             layer_norm_op = Operator(
                 "layer_norm",
@@ -200,13 +205,10 @@ class TestLayerNormdOp(OpTest):
             layer_norm_op.run(scope, place)
 
             # check forward result
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(y_tensor, y_out, "Y", atol)
-            self.__assert_close(mean_tensor, saved_mean, "Mean", atol)
-            self.__assert_close(variance_tensor, var_ref, "Variance", atol)
+            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+            for o_tensor in output_tensor:
+                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                                    o_tensor, atol)
 
             # run backward
             layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
@@ -216,30 +218,28 @@ class TestLayerNormdOp(OpTest):
                 feed_dict={"Y": y_grad})
             layer_norm_op_grad.run(scope, place)
 
-            x_grad_tensor = create_or_get_tensor(scope,
-                                                 grad_var_name("X"), None,
-                                                 place)
-            scale_grad_tensor = create_or_get_tensor(scope,
-                                                     grad_var_name("Scale"),
-                                                     None, place)
-            bias_grad_tensor = create_or_get_tensor(scope,
-                                                    grad_var_name("Bias"), None,
-                                                    place)
+            # get output
+            grad_tensor = {}
+            for o_name in naive_grad:
+                grad_tensor[o_name] = create_or_get_tensor(
+                    scope, grad_var_name(o_name), None, place)
 
             # check gradient output
-            self.__assert_grad_close(x_grad_tensor, x_grad_ref, "x_grad", place)
-            self.__assert_grad_close(scale_grad_tensor, scale_grad_ref,
-                                     "scale_grad", place)
-            self.__assert_grad_close(bias_grad_tensor, bias_grad_ref,
-                                     "bias_grad", place)
+            for o_grad in naive_grad:
+                self.__assert_grad_close(grad_tensor[o_grad],
+                                         naive_grad[o_grad], o_grad + "@GRAD",
+                                         place)
 
         places = [core.CPUPlace()]
         if core.is_compile_gpu() and core.op_support_gpu("layer_norm"):
            places.append(core.CUDAPlace(0))
 
         for place in places:
-            test_with_place(place, [2, 3, 4, 5], begin_norm_axis=1)
-            test_with_place(place, [2, 3, 4, 5], begin_norm_axis=3)
+            test_with_place(place, shape, begin_norm_axis)
+
+    def test_check_forward_backward(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
 
 if __name__ == '__main__':
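
Note (reviewer addition, not part of the patch): _reference_layer_norm_naive above is plain NumPy layer normalization over the flattened [N, D] view of the input, with epsilon folded into the variance it returns. A minimal standalone sketch of that forward computation follows; the function name ref_layer_norm, the default epsilon, and the driver shapes are illustrative assumptions rather than anything taken from the patch.

import numpy as np


def ref_layer_norm(x, scale, bias, epsilon=1e-5, begin_norm_axis=1):
    # Flatten to [N, D]: N = product of dims before begin_norm_axis,
    # D = product of the remaining (normalized) dims.
    N = int(np.prod(x.shape[:begin_norm_axis]))
    D = int(np.prod(x.shape[begin_norm_axis:]))
    x2d = x.reshape(N, D)
    mean = x2d.mean(axis=1, keepdims=True)           # [N, 1]
    var = x2d.var(axis=1, keepdims=True) + epsilon   # epsilon folded in, as in the test helper
    y = scale.reshape(1, D) * (x2d - mean) / np.sqrt(var) + bias.reshape(1, D)
    return y.reshape(x.shape), mean.ravel(), var.ravel()


if __name__ == '__main__':
    x = np.random.random_sample([2, 3, 4, 5]).astype(np.float32)
    D = 3 * 4 * 5
    scale = np.random.random_sample([D]).astype(np.float32)
    bias = np.random.random_sample([D]).astype(np.float32)
    y, mean, var = ref_layer_norm(x, scale, bias, begin_norm_axis=1)
    print(y.shape, mean.shape, var.shape)  # (2, 3, 4, 5) (2,) (2,)

The per-row mean and variance come back with shape [N], which is the form the test compares against the operator's Mean and Variance outputs.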