From 9171f73714f86f3675348b8140a3863a9fa38d94 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Thu, 24 Oct 2019 19:50:05 +0800 Subject: [PATCH] fix fp16 grid_size for size=1; test=develop (#20812) --- .../fluid/operators/elementwise/elementwise_add_op.cu | 11 ++++++----- .../fluid/operators/elementwise/elementwise_div_op.cu | 11 ++++++----- .../fluid/operators/elementwise/elementwise_mul_op.cu | 11 ++++++----- .../fluid/operators/elementwise/elementwise_sub_op.cu | 11 ++++++----- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index de121b3786f..71019872802 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -40,8 +40,9 @@ struct SameDimsElemwiseAdd { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 gird_size = dim3( - (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); + dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); const half* x2 = reinterpret_cast(x->data()); @@ -49,7 +50,7 @@ struct SameDimsElemwiseAdd { reinterpret_cast(y->data()); half* z2 = reinterpret_cast(z->data()); SameDimsElemwiseAddCUDAKernel<<< - gird_size, block_size, 0, + grid_size, block_size, 0, ctx.template device_context().stream()>>>( x2, y2, z2, size); } @@ -78,10 +79,10 @@ elementwise_add_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); auto size = x->numel(); - dim3 gird_size = + dim3 grid_size = dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); SimpleElemwiseAddGradCUDAKernel< - T><<<<().stream()>>>( dout->data(), size, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index b1698491180..e31722a2881 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -41,8 +41,9 @@ struct SameDimsElemwiseDiv { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 gird_size = dim3( - (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); + dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); const half* x2 = reinterpret_cast(x->data()); @@ -50,7 +51,7 @@ struct SameDimsElemwiseDiv { reinterpret_cast(y->data()); half* z2 = reinterpret_cast(z->data()); SameDimsElemwiseDivCUDAKernel<<< - gird_size, block_size, 0, + grid_size, block_size, 0, ctx.template device_context().stream()>>>( x2, y2, z2, size); } @@ -82,10 +83,10 @@ elementwise_div_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); auto size = x->numel(); - dim3 gird_size = + dim3 grid_size = dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); SimpleElemwiseDivGradCUDAKernel< - T><<<<().stream()>>>( x->data(), y->data(), out->data(), dout->data(), size, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 4814cb144f0..8533189f81a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -41,8 +41,9 @@ struct SameDimsElemwiseMul { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 gird_size = dim3( - (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); + dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); const half* x2 = reinterpret_cast(x->data()); @@ -50,7 +51,7 @@ struct SameDimsElemwiseMul { reinterpret_cast(y->data()); half* z2 = reinterpret_cast(z->data()); SameDimsElemwiseMulCUDAKernel<<< - gird_size, block_size, 0, + grid_size, block_size, 0, ctx.template device_context().stream()>>>( x2, y2, z2, size); } @@ -82,10 +83,10 @@ elementwise_mul_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); auto size = x->numel(); - dim3 gird_size = + dim3 grid_size = dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); SimpleElemwiseMulGradCUDAKernel< - T><<<<().stream()>>>( x->data(), y->data(), out->data(), dout->data(), size, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 7ff72028091..9913927ee3c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -41,8 +41,9 @@ struct SameDimsElemwiseSub { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 gird_size = dim3( - (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); + dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); const half* x2 = reinterpret_cast(x->data()); @@ -50,7 +51,7 @@ struct SameDimsElemwiseSub { reinterpret_cast(y->data()); half* z2 = reinterpret_cast(z->data()); SameDimsElemwiseSubCUDAKernel<<< - gird_size, block_size, 0, + grid_size, block_size, 0, ctx.template device_context().stream()>>>( x2, y2, z2, size); } @@ -79,10 +80,10 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); auto size = x->numel(); - dim3 gird_size = + dim3 grid_size = dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); SimpleElemwiseSubGradCUDAKernel< - T><<<<().stream()>>>( dout->data(), size, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); -- GitLab