未验证 提交 ef96ffb6 编写于 作者: L Li Min 提交者: GitHub

[Fix bug] Fix fp16 atomicAdd compile error on different cuda_arch. (#39886)

* Fix compile error on cuda_arch less than 700.
上级 04d324b2
......@@ -210,6 +210,12 @@ template <typename T, typename std::enable_if<std::is_same<
platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
#if ((CUDA_VERSION < 10000) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
#else
int i = 0;
int loops = len / 2 * 2;
......@@ -233,6 +239,7 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
fastAtomicAdd(out, i, len, in[i]);
}
}
#endif
}
#endif
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册