未验证 提交 8d87b3bc 编写于 作者: Z Zhang Zheng 提交者: GitHub

Modify the implementation of BlockYReduce to fit more scenarios (#39170)

上级 b007a031
...@@ -118,7 +118,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { ...@@ -118,7 +118,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
*/ */
template <typename T, typename ReduceOp> template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
__shared__ T shared_memory[details::kReduceMaxThread]; __shared__ T shared_memory[1024];
shared_memory[SharedMemoryIndex(0)] = val; shared_memory[SharedMemoryIndex(0)] = val;
for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) { for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) {
__syncthreads(); __syncthreads();
...@@ -128,7 +128,8 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { ...@@ -128,7 +128,8 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
} }
shared_memory[SharedMemoryIndex(0)] = val; shared_memory[SharedMemoryIndex(0)] = val;
} }
return val; __syncthreads();
return shared_memory[threadIdx.x];
} }
} // namespace details } // namespace details
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册