未验证 提交 8d87b3bc 编写于 作者: Z Zhang Zheng 提交者: GitHub

Modify the implementation of BlockYReduce to fit more scenes (#39170)

上级 b007a031
......@@ -118,7 +118,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
*/
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
- __shared__ T shared_memory[details::kReduceMaxThread];
+ __shared__ T shared_memory[1024];
shared_memory[SharedMemoryIndex(0)] = val;
for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) {
__syncthreads();
......@@ -128,7 +128,8 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
}
shared_memory[SharedMemoryIndex(0)] = val;
}
- return val;
+ __syncthreads();
+ return shared_memory[threadIdx.x];
}
} // namespace details
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册