diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index 2265077d51bb8bf5287929d4f1864e1961519a0e..b3da41976624bcbed51c547e582c8c42acb31499 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -91,10 +91,13 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
   __shared__ T shared[2 * kWarpSize];
   int block_dim_x = blockDim.x;
   if (blockDim.x > kWarpSize) {
-    block_dim_x = blockDim.x / kWarpSize;
-    int lane = threadIdx.x % kWarpSize;
+    // Bit operation can be used when kWarpSize is 32 or 64 now
+    constexpr int rshift_val =
+        (kWarpSize != 32) ? ((kWarpSize == 64) ? 6 : 5) : 5;
+    block_dim_x = blockDim.x >> rshift_val;
+    int lane = threadIdx.x & (kWarpSize - 1);
     int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    int wid = tid / kWarpSize;
+    int wid = tid >> rshift_val;
     int bid = threadIdx.y;
     val = WarpReduce(val, reducer);
     if (lane == 0) {
@@ -110,6 +113,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
     T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride);
     val = reducer(val, temp);
   }
+  __syncthreads();
   if (threadIdx.x == 0) {
     shared[threadIdx.y] = val;
   }
@@ -385,8 +389,8 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
 /**
  * @brief The Reduce provides collective methods for computing a parallel
  * reduction of items partitioned across a CUDA block and intra thread. When
- * ReduceMode == kLocalMode, thread reduce along nx. When ReduceMode ==
- * kGlobalMode, use shared memory to reduce between threads.
+ * ReduceMode == kLocalMode, use shared memory to reduce between threads.When
+ * ReduceMode == kGlobalMode, thread reduce along nx.
  *
  * @template paraments
  * T: The type of data.
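
For reference, the shift-and-mask rewrite in `BlockXReduce` relies on `kWarpSize` being a power of two (32 for CUDA warps, 64 for HIP wavefronts). The standalone host-side sketch below is not part of the patch; it only checks the identities the change depends on, reusing the names `kWarpSize` and `rshift_val` from the hunk above with an assumed value of 64.

```cpp
// Sanity-check sketch (plain host C++): division and modulo by a
// power-of-two warp size agree with the shift and mask forms used in
// BlockXReduce. kWarpSize and rshift_val mirror the names in the patch
// but are local to this example.
#include <cassert>
#include <cstdio>

constexpr int kWarpSize = 64;  // assumed: 32 on CUDA, 64 on HIP wavefronts
// Shift amount chosen exactly as in the patch: 6 when kWarpSize == 64,
// otherwise 5 (i.e. kWarpSize == 32).
constexpr int rshift_val = (kWarpSize != 32) ? ((kWarpSize == 64) ? 6 : 5) : 5;

int main() {
  static_assert((kWarpSize & (kWarpSize - 1)) == 0,
                "shift/mask tricks require a power-of-two warp size");
  // For every thread id that can occur in a block, the two forms agree.
  for (int tid = 0; tid < 1024; ++tid) {
    assert(tid / kWarpSize == (tid >> rshift_val));
    assert(tid % kWarpSize == (tid & (kWarpSize - 1)));
  }
  std::printf("shift/mask identities hold for kWarpSize = %d\n", kWarpSize);
  return 0;
}
```

Note that these identities only hold for non-negative operands, which thread, lane, and warp indices always are.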