Add todo for reduceSum

90dc33b5 · chengduoZH · b8938b44 · 90dc33b5
隐藏空白更改
内联并排

Showing 1 changed file with 6 additions and 0 deletions

paddle/fluid/platform/cuda_helper.h paddle/fluid/platform/cuda_helper.h +6 -0

未找到文件。
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -77,6 +77,12 @@ __forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
+  // TODO(zcd): The warp size should be taken from the
+  // GPU's parameters rather than simply hard-coded as 32.
+  // To make reduceSum more efficient, this function uses
+  // warp-level parallelism and assumes the warp size
+  // is 32, which may differ between GPUs,
+  // although most cards have a warp size of 32.
  __shared__ T shm[32];
  const int warpSize = 32;
  unsigned mask = 0u;