From 77dbb318e77146aa043be3f276959e0c42d3911c Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Thu, 27 Oct 2022 13:38:06 +0800 Subject: [PATCH] fix reduce_any kernel data race on sharedMem (#47233) * fix reduce_any kernel data race on sharedMem * use bit operation instead of div & mod * unbranch * modified according to PR comments --- paddle/phi/kernels/primitive/compute_primitives.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 2265077d51..b3da419766 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -91,10 +91,13 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { __shared__ T shared[2 * kWarpSize]; int block_dim_x = blockDim.x; if (blockDim.x > kWarpSize) { - block_dim_x = blockDim.x / kWarpSize; - int lane = threadIdx.x % kWarpSize; + // Bit operation can be used when kWarpSize is 32 or 64 now + constexpr int rshift_val = + (kWarpSize != 32) ? ((kWarpSize == 64) ? 6 : 5) : 5; + block_dim_x = blockDim.x >> rshift_val; + int lane = threadIdx.x & (kWarpSize - 1); int tid = threadIdx.y * blockDim.x + threadIdx.x; - int wid = tid / kWarpSize; + int wid = tid >> rshift_val; int bid = threadIdx.y; val = WarpReduce(val, reducer); if (lane == 0) { @@ -110,6 +113,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride); val = reducer(val, temp); } + __syncthreads(); if (threadIdx.x == 0) { shared[threadIdx.y] = val; } @@ -385,8 +389,8 @@ __device__ __forceinline__ void CycleBinary(OutT* out, /** * @brief The Reduce provides collective methods for computing a parallel * reduction of items partitioned across a CUDA block and intra thread. When - * ReduceMode == kLocalMode, thread reduce along nx. 
When ReduceMode == - * kGlobalMode, use shared memory to reduce between threads. + * ReduceMode == kLocalMode, use shared memory to reduce between threads. When + * kGlobalMode, thread reduce along nx. * * @template paraments * T: The type of data. -- GitLab