Fix int32 overflow in CUDA kernel loop (#38007)

37f43ebc · Leo Chen · GitHub · dabf8152 · 37f43ebc
隐藏空白更改
内联并排

Showing 1 changed file with 3 additions and 6 deletions

paddle/fluid/operators/label_smooth_op.cu paddle/fluid/operators/label_smooth_op.cu +3 -6

未找到文件。
--- a/paddle/fluid/operators/label_smooth_op.cu
+++ b/paddle/fluid/operators/label_smooth_op.cu
@@ -21,8 +21,7 @@ template <typename T>
 __global__ void LabelSmoothRunOriginKernel(const int N, const float epsilon,
                                           const int label_dim, const T* src,
                                           T* dst) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_KERNEL_LOOP(idx, N) {
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
    dst[idx] = static_cast<T>(1 - epsilon) * src[idx] +
               static_cast<T>(epsilon / label_dim);
  }
@@ -32,8 +31,7 @@ template <typename T>
 __global__ void LabelSmoothRunDistKernel(const int N, const float epsilon,
                                         const int dist_numel, const T* src,
                                         const T* dist_data, T* dst) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_KERNEL_LOOP(idx, N) {
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
    int dist_idx = idx % dist_numel;
    dst[idx] = static_cast<T>(1 - epsilon) * src[idx] +
               static_cast<T>(epsilon) * dist_data[dist_idx];
@@ -43,8 +41,7 @@ __global__ void LabelSmoothRunDistKernel(const int N, const float epsilon,
 template <typename T>
 __global__ void LabelSmoothGradRunKernel(const int N, const float epsilon,
                                         const T* src, T* dst) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_KERNEL_LOOP(idx, N) {
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
    dst[idx] = static_cast<T>(1 - epsilon) * src[idx];
  }
 }