diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 415182201a7a9e11d8ea8c62b92849b5ea3bac3e..f0362ec606c994d69f31c7a2e1e9ad0d0108b621 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -356,8 +356,8 @@ __device__ T reduceSum(T val, int tid, int len) {
   // I use Warp-Level Parallelism and assume the Warp size
   // is 32 which may be different for different GPU,
   // but most card's warp size is 32.
-  __shared__ T shm[32];
   const int warpSize = 32;
+  __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
 
@@ -371,6 +371,7 @@ __device__ T reduceSum(T val, int tid, int len) {
   if (tid % warpSize == 0) {
     shm[tid / warpSize] = val;
   }
+  __syncthreads();
 
   CREATE_SHFL_MASK(mask, tid < warpSize);
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f45381af8ac64d117eb27325f25763fbf6cae7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
@@ -0,0 +1,119 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+class TestElementWiseAddOp(unittest.TestCase):
+    """Gradient check for elementwise_add with Y broadcast along one axis.
+
+    X@GRAD is expected to equal Out@GRAD, and Y@GRAD is expected to equal
+    Out@GRAD summed over every axis of X except `self.axis`.
+    """
+
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        # np.allclose applies its default rtol in addition to `atol`.
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def check_forward_backward(self):
+        """Run the grad op derived from elementwise_add on each place.
+
+        Reads self.x, self.y, self.out and self.axis set by the test method.
+        """
+
+        def test_with_place(place):
+            out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
+            x_grad = out_grad
+            # A Python 3 range does not support `del`, so materialize a list.
+            sum_axis = list(range(len(self.x.shape)))
+            del sum_axis[self.axis]
+            y_grad = np.sum(out_grad, axis=tuple(sum_axis))
+
+            # Name -> array table used for variable creation and feeding.
+            ground_truth = {
+                'x': self.x,
+                'y': self.y,
+                'out': self.out,
+                'y@GRAD': y_grad,
+                'x@GRAD': x_grad,
+                'out@GRAD': out_grad,
+            }
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                elementwise_add_op = block.append_op(
+                    type="elementwise_add",
+                    inputs={
+                        "X": block.var('x'),
+                        "Y": block.var('y'),
+                    },
+                    outputs={"Out": block.var('out'), },
+                    attrs={"axis": self.axis, })
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    elementwise_add_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(program,
+                              feed={
+                                  name: ground_truth[name]
+                                  for name in ['x', 'y', 'out@GRAD']
+                              },
+                              fetch_list=['x@GRAD', 'y@GRAD'])
+                self.__assert_close(x_grad, out[0], "x@GRAD")
+                # NOTE(review): atol=1.4 is very loose; presumably it absorbs
+                # float32 accumulation error in the per-channel sum -- confirm.
+                self.__assert_close(y_grad, out[1], "y@GRAD", atol=1.4)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "elementwise_add"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        # NOTE(review): despite the name, no scale or bias is involved; only
+        # elementwise_add with y (32,) broadcast on axis 1 of x is exercised.
+        np.random.seed(123)
+        self.x = np.random.random((4, 32, 220, 220)).astype(np.float32)
+        self.y = np.random.random((32)).astype(np.float32)
+        self.out = self.x + self.y.reshape(1, 32, 1, 1)
+        self.axis = 1
+        self.check_forward_backward()
+
+
+if __name__ == '__main__':
+    unittest.main()