未验证 提交 7e20e769 编写于 作者: Y Yibing Liu 提交者: GitHub

Fix the bug in fp16 backward kernel (#16269)

test=develop
上级 af030088
...@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out, ...@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out,
paddle::platform::float16* d_in) { paddle::platform::float16* d_in) {
int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x; int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
if (out_idx < n) { if (out_idx < n) {
int64_t out_idx_tmp = out_idx;
int coords[D] = {0}; int coords[D] = {0};
for (int i = D - 1; i >= 0; --i) { for (int i = D - 1; i >= 0; --i) {
coords[i] = out_idx % out_dims[i]; coords[i] = out_idx_tmp % out_dims[i];
out_idx /= out_dims[i]; out_idx_tmp /= out_dims[i];
coords[i] += offsets[i]; coords[i] += offsets[i];
} }
int64_t in_idx = 0; int64_t in_idx = 0;
for (int i = 0; i < D - 1; ++i) { for (int i = 0; i < D; ++i) {
in_idx += coords[i] * in_dims[i + 1]; in_idx = in_idx * in_dims[i] + coords[i];
} }
in_idx += coords[D - 1];
d_in[in_idx] = d_out[out_idx]; d_in[in_idx] = d_out[out_idx];
} }
...@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext, ...@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext,
set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0)); set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
int64_t numel = d_out->numel(); int64_t numel = d_out->numel();
dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1); dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1); dim3 threads(PADDLE_CUDA_NUM_THREADS);
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
auto out_shape = framework::vectorize2int(out_dims); auto out_shape = framework::vectorize2int(out_dims);
......
...@@ -87,5 +87,31 @@ class TestFP16(TestSliceOp): ...@@ -87,5 +87,31 @@ class TestFP16(TestSliceOp):
place, ['Input'], 'Out', max_relative_error=0.006) place, ['Input'], 'Out', max_relative_error=0.006)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16_2(TestSliceOp):
def config(self):
self.dtype = "float16"
self.input = np.random.random([3, 4, 5]).astype(self.dtype)
self.starts = [0]
self.ends = [1]
self.axes = [1]
self.out = self.input[:, 0:1, :]
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-5)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册