diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index e81c385727be5c2ba3f02bfbd86168cb4650dfda..ecec4178f2d9937920e52eb74bf9068b84e741a0 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -63,6 +63,7 @@ __device__ T reduceSum(T val, int tid, int len) { val += platform::CudaShuffleDownSync(mask, val, offset); if (tid < warpSize) shm[tid] = 0; + __syncthreads(); if (tid % warpSize == 0) { shm[tid / warpSize] = val;