From 7e20e7691e3dda88a84d84a00be5dffcd9772ebf Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Tue, 19 Mar 2019 10:18:19 +0800 Subject: [PATCH] Fix the bug in fp16 backward kernel (#16269) test=develop --- paddle/fluid/operators/slice_op.cu | 14 +++++----- .../fluid/tests/unittests/test_slice_op.py | 26 +++++++++++++++++++ 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 1af57b89a35..24a564f9ef9 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out, paddle::platform::float16* d_in) { int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x; if (out_idx < n) { + int64_t out_idx_tmp = out_idx; int coords[D] = {0}; for (int i = D - 1; i >= 0; --i) { - coords[i] = out_idx % out_dims[i]; - out_idx /= out_dims[i]; + coords[i] = out_idx_tmp % out_dims[i]; + out_idx_tmp /= out_dims[i]; coords[i] += offsets[i]; } int64_t in_idx = 0; - for (int i = 0; i < D - 1; ++i) { - in_idx += coords[i] * in_dims[i + 1]; + for (int i = 0; i < D; ++i) { + in_idx = in_idx * in_dims[i] + coords[i]; } - in_idx += coords[D - 1]; d_in[in_idx] = d_out[out_idx]; } @@ -80,8 +80,8 @@ class SliceGradKernel(0)); int64_t numel = d_out->numel(); - dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1); - dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1); + dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1); + dim3 threads(PADDLE_CUDA_NUM_THREADS); auto stream = ctx.cuda_device_context().stream(); auto out_shape = framework::vectorize2int(out_dims); diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 5fdabbabeda..aefd8cb6d3a 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -87,5 +87,31 @@ class TestFP16(TestSliceOp): place, ['Input'], 'Out', max_relative_error=0.006) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFP16_2(TestSliceOp): + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 5]).astype(self.dtype) + self.starts = [0] + self.ends = [1] + self.axes = [1] + self.out = self.input[:, 0:1, :] + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-5) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['Input'], + 'Out', + max_relative_error=0.006, + numeric_grad_delta=0.5) + + if __name__ == '__main__': unittest.main() -- GitLab