diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 415182201a7a9e11d8ea8c62b92849b5ea3bac3e..f0362ec606c994d69f31c7a2e1e9ad0d0108b621 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -356,8 +356,8 @@ __device__ T reduceSum(T val, int tid, int len) { // I use Warp-Level Parallelism and assume the Warp size // is 32 which may be different for different GPU, // but most card's warp size is 32. - __shared__ T shm[32]; const int warpSize = 32; + __shared__ T shm[warpSize]; unsigned mask = 0u; CREATE_SHFL_MASK(mask, tid < len); @@ -371,6 +371,7 @@ __device__ T reduceSum(T val, int tid, int len) { if (tid % warpSize == 0) { shm[tid / warpSize] = val; } + __syncthreads(); CREATE_SHFL_MASK(mask, tid < warpSize); diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py new file mode 100644 index 0000000000000000000000000000000000000000..c6f45381af8ac64d117eb27325f25763fbf6cae7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import numpy as np + +import paddle.fluid.core as core +import paddle.fluid as fluid + + +class TestElementWiseAddOp(unittest.TestCase): + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_forward_backward(self): + def test_with_place(place): + out_grad = np.random.random_sample(self.x.shape).astype(np.float32) + x_grad = out_grad + sum_axis = range(0, len(self.x.shape)) + del sum_axis[self.axis] + y_grad = np.sum(out_grad, axis=tuple(sum_axis)) + + var_dict = locals() + var_dict['y'] = self.y + var_dict['x'] = self.x + var_dict['out'] = self.out + var_dict['y@GRAD'] = y_grad + var_dict['x@GRAD'] = x_grad + var_dict['out@GRAD'] = out_grad + + var_names = ['x', 'y', 'out', 'y@GRAD', 'x@GRAD', 'out@GRAD'] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + elementwise_add_op = block.append_op( + type="elementwise_add", + inputs={ + "X": block.var('x'), + "Y": block.var('y'), + }, + outputs={"Out": block.var('out'), }, + attrs={"axis": self.axis, }) + + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + elementwise_add_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in ['x', 'y', 'out@GRAD'] + }, + fetch_list=['x@GRAD', 'y@GRAD']) + self.__assert_close(x_grad, out[0], "x@GRAD") + self.__assert_close(y_grad, out[1], "y@GRAD", atol=1.4) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "elementwise_add"): + places.append(core.CUDAPlace(0)) + + for place in places: + test_with_place(place) + + def test_check_forward_backward_with_scale_and_bias(self): + np.random.seed(123) + self.x = np.random.random((4, 32, 220, 220)).astype(np.float32) + self.y = np.random.random((32)).astype(np.float32) + self.out = self.x + self.y.reshape(1, 32, 1, 1) + self.axis = 1 + self.check_forward_backward() + + +if __name__ == '__main__': + unittest.main()