Unverified commit 033ebe7e, authored by sneaxiy, committed by GitHub

Refine CUDA atomicAdd for FP16 by CUDA primitive methods (#37895)

* fix cuda atomicAdd for FP16

* try to fix ci
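
For context, the change below gates a native FP16 atomic behind CUDA 10.0+ and compute capability 7.0+ (Volta), keeping the older 32-bit compare-and-swap emulation as the fallback. The following is a minimal, self-contained sketch of both strategies; HalfAtomicAdd and its internals are illustrative names only, not Paddle's actual wrapper (which goes through CUDA_ATOMIC_WRAPPER and Paddle's own float16 type):

#include <cuda.h>        // CUDA_VERSION
#include <cuda_fp16.h>   // __half and its intrinsics
#include <cstdint>       // uintptr_t

__device__ __half HalfAtomicAdd(__half *address, __half val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000
  // Fast path: a single native 16-bit atomic add.
  return atomicAdd(address, val);
#else
  // Fallback: emulate with compare-and-swap on the aligned 32-bit word
  // that contains the 16-bit operand.
  unsigned int *aligned = reinterpret_cast<unsigned int *>(
      reinterpret_cast<uintptr_t>(address) & ~uintptr_t(3));
  bool is_high = (reinterpret_cast<uintptr_t>(address) & 2) != 0;
  unsigned int old = *aligned, assumed;
  do {
    assumed = old;
    unsigned short bits = is_high ? (unsigned short)(assumed >> 16)
                                  : (unsigned short)(assumed & 0xFFFFu);
    // Do the addition in float, mirroring the add_to_high_half helper
    // visible in the diff below, which takes a float operand.
    float sum = __half2float(__ushort_as_half(bits)) + __half2float(val);
    unsigned short sum_bits = __half_as_ushort(__float2half(sum));
    unsigned int replaced =
        is_high ? (assumed & 0xFFFFu) | ((unsigned int)sum_bits << 16)
                : (assumed & 0xFFFF0000u) | (unsigned int)sum_bits;
    old = atomicCAS(aligned, assumed, replaced);
  } while (assumed != old);
  // Return the previous value, matching atomicAdd semantics.
  return __ushort_as_half(is_high ? (unsigned short)(old >> 16)
                                  : (unsigned short)(old & 0xFFFFu));
#endif
}

The fast path replaces an entire CAS retry loop with one hardware instruction, which is the point of the refinement in this commit.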
Parent 491d4f01
@@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
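// Fast path: bit-cast between Paddle's float16 and CUDA's __half, then call
// the native atomicAdd(__half *, __half), which is only available on
// CUDA 10.0+ and compute capability 7.0+ (Volta).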
static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
return *reinterpret_cast<float16 *>(&x);
}
static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
return *reinterpret_cast<__half *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, float16) {
return CUDAFP16ToPDFP16(
atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, float16) {
// The packed float16 value may exist in either the lower or the higher
// 16 bits of the aligned 32-bit word at this address.
@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
}
}
#endif
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
......
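
Assuming CUDA_ATOMIC_WRAPPER(Add, T) expands to a device function named CudaAtomicAdd(T *address, T val), as is the convention in this header, a hypothetical call site would look like the kernel below (the kernel itself is illustrative, not part of this commit):

// Every thread atomically accumulates one element into a single float16
// cell; on SM 7.0+ with CUDA 10.0+ this now compiles to the fast path.
__global__ void SumIntoFP16(const float16 *src, float16 *sum, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    CudaAtomicAdd(sum, src[i]);
  }
}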