Merge pull request #10366 from chengduoZH/feature/fix_shlf_for_cuda9.0

Fix __shfl and __shfl_down for CUDA9.0

Merge pull request #10366 from chengduoZH/feature/fix_shlf_for_cuda9.0
Fix __shfl and __shfl_down for CUDA9.0
8a071ffb · chengduo · GitHub · e3b8db0b · e97c1a8c · 8a071ffb
隐藏空白更改
内联并排

Showing with 10 addition and 0 deletion

paddle/fluid/platform/cuda_device_function.h paddle/fluid/platform/cuda_device_function.h +10 -0

未找到文件。
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -35,6 +35,16 @@ __forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line,
 #define FULL_WARP_MASK 0xFFFFFFFF
 #define CREATE_SHFL_MASK(mask, predicate) \
  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) {
+  return __shfl_down_sync(mask, val, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T __shfl_sync(unsigned mask, T val, int src_line,
+                                         int width) {
+  return __shfl_sync(mask, val, src_line, width);
+}
 #endif

 template <typename T>