From 77dbb318e77146aa043be3f276959e0c42d3911c Mon Sep 17 00:00:00 2001
From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Date: Thu, 27 Oct 2022 13:38:06 +0800
Subject: [PATCH] fix reduce_any kernel data race on sharedMem (#47233)

* fix reduce_any kernel data race on sharedMem

* use bit operation instead of div & mod

* unbranch

* modified according to PR comments
---
 paddle/phi/kernels/primitive/compute_primitives.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index 2265077d51b..b3da4197662 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -91,10 +91,13 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
   __shared__ T shared[2 * kWarpSize];
   int block_dim_x = blockDim.x;
   if (blockDim.x > kWarpSize) {
-    block_dim_x = blockDim.x / kWarpSize;
-    int lane = threadIdx.x % kWarpSize;
+    // Bit operation can be used when kWarpSize is 32 or 64 now
+    constexpr int rshift_val =
+        (kWarpSize != 32) ? ((kWarpSize == 64) ? 6 : 5) : 5;
+    block_dim_x = blockDim.x >> rshift_val;
+    int lane = threadIdx.x & (kWarpSize - 1);
     int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    int wid = tid / kWarpSize;
+    int wid = tid >> rshift_val;
     int bid = threadIdx.y;
     val = WarpReduce(val, reducer);
     if (lane == 0) {
@@ -110,6 +113,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
     T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride);
     val = reducer(val, temp);
   }
+  __syncthreads();
   if (threadIdx.x == 0) {
     shared[threadIdx.y] = val;
   }
@@ -385,8 +389,8 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
 /**
  * @brief The Reduce provides collective methods for computing a parallel
  * reduction of items partitioned across a CUDA block and intra thread. When
- * ReduceMode == kLocalMode, thread reduce along nx. When ReduceMode ==
- * kGlobalMode, use shared memory to reduce between threads.
+ * ReduceMode == kLocalMode, use shared memory to reduce between threads.When
+ * ReduceMode == kGlobalMode, thread reduce along nx.
  *
  * @template paraments
  * T: The type of data.
-- 
GitLab