Fix the bug in fp16 backward kernel (#16266)

test=release/1.3

Fix the bug in fp16 backward kernel (#16266)
test=release/1.3
e61d7245 · Yibing Liu · GitHub · c56d9026 · e61d7245 · e61d7245
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 33 additions and 7 deletions

paddle/fluid/operators/slice_op.cu +7 -7

python/paddle/fluid/tests/unittests/test_slice_op.py +26 -0

No files found.
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out,
                        paddle::platform::float16* d_in) {
  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (out_idx < n) {
+    int64_t out_idx_tmp = out_idx;
    int coords[D] = {0};
    for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx % out_dims[i];
-      out_idx /= out_dims[i];
+      coords[i] = out_idx_tmp % out_dims[i];
+      out_idx_tmp /= out_dims[i];
      coords[i] += offsets[i];
    }

    int64_t in_idx = 0;
-    for (int i = 0; i < D - 1; ++i) {
-      in_idx += coords[i] * in_dims[i + 1];
+    for (int i = 0; i < D; ++i) {
+      in_idx = in_idx * in_dims[i] + coords[i];
    }
-    in_idx += coords[D - 1];

    d_in[in_idx] = d_out[out_idx];
  }
@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext,
    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));

    int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1);
+    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
+    dim3 threads(PADDLE_CUDA_NUM_THREADS);
    auto stream = ctx.cuda_device_context().stream();

    auto out_shape = framework::vectorize2int(out_dims);

--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -87,5 +87,31 @@ class TestFP16(TestSliceOp):
                place, ['Input'], 'Out', max_relative_error=0.006)


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFP16_2(TestSliceOp):
+    def config(self):
+        self.dtype = "float16"
+        self.input = np.random.random([3, 4, 5]).astype(self.dtype)
+        self.starts = [0]
+        self.ends = [1]
+        self.axes = [1]
+        self.out = self.input[:, 0:1, :]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-5)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Out',
+                max_relative_error=0.006,
+                numeric_grad_delta=0.5)
+
+
 if __name__ == '__main__':
    unittest.main()