From 8d87b3bcc7015569887ab40276c6005d9bef88a8 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Wed, 9 Feb 2022 10:58:42 +0800 Subject: [PATCH] Modify the implementation of BlockYReduce to fit more scenes (#39170) --- paddle/pten/kernels/primitive/compute_primitives.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/pten/kernels/primitive/compute_primitives.h b/paddle/pten/kernels/primitive/compute_primitives.h index 449c81b915e..a8ed0816227 100644 --- a/paddle/pten/kernels/primitive/compute_primitives.h +++ b/paddle/pten/kernels/primitive/compute_primitives.h @@ -118,7 +118,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { */ template <typename T, typename ReduceOp> __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { - __shared__ T shared_memory[details::kReduceMaxThread]; + __shared__ T shared_memory[1024]; shared_memory[SharedMemoryIndex(0)] = val; for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) { __syncthreads(); @@ -128,7 +128,8 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } shared_memory[SharedMemoryIndex(0)] = val; } - return val; + __syncthreads(); + return shared_memory[threadIdx.x]; } } // namespace details -- GitLab