fix fp16 grid_size for size=1; test=release/1.6 (#20806)

828fbd57 · danleifeng · gongweibao · a6fc250d · 828fbd57 · 828fbd57
4 changed files
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -40,8 +40,9 @@ struct SameDimsElemwiseAdd<platform::CUDADeviceContext, platform::float16> {
                  const framework::Tensor* x, const framework::Tensor* y,
                  framework::Tensor* z) {
    auto size = x->numel();
-    dim3 gird_size = dim3(
+    dim3 gird_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
-        (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
+                              PADDLE_CUDA_THREAD_SIZE,
+                          1);
    dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
    const half* x2 =
        reinterpret_cast<const half*>(x->data<platform::float16>());

--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -41,8 +41,9 @@ struct SameDimsElemwiseDiv<platform::CUDADeviceContext, platform::float16> {
                  const framework::Tensor* x, const framework::Tensor* y,
                  framework::Tensor* z) {
    auto size = x->numel();
-    dim3 gird_size = dim3(
+    dim3 gird_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
-        (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
+                              PADDLE_CUDA_THREAD_SIZE,
+                          1);
    dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
    const half* x2 =
        reinterpret_cast<const half*>(x->data<platform::float16>());

--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -41,8 +41,9 @@ struct SameDimsElemwiseMul<platform::CUDADeviceContext, platform::float16> {
                  const framework::Tensor* x, const framework::Tensor* y,
                  framework::Tensor* z) {
    auto size = x->numel();
-    dim3 gird_size = dim3(
+    dim3 gird_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
-        (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
+                              PADDLE_CUDA_THREAD_SIZE,
+                          1);
    dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
    const half* x2 =
        reinterpret_cast<const half*>(x->data<platform::float16>());

--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -41,8 +41,9 @@ struct SameDimsElemwiseSub<platform::CUDADeviceContext, platform::float16> {
                  const framework::Tensor* x, const framework::Tensor* y,
                  framework::Tensor* z) {
    auto size = x->numel();
-    dim3 gird_size = dim3(
+    dim3 gird_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
-        (size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
+                              PADDLE_CUDA_THREAD_SIZE,
+                          1);
    dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
    const half* x2 =
        reinterpret_cast<const half*>(x->data<platform::float16>());