diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
index 7185d2356aae5afbd798cf083df1e4c69d69731d..d1d33d50a5dbb9b6d3059bc5c83acca049fb8a93 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
@@ -70,9 +70,10 @@ namespace platform {
  *
  */
 
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)            \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
-  for (index_type i = __index__; __index__ < (num);          \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
+  int64_t __index__ =                                              \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
+  for (index_type i = __index__; __index__ < (num);                \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 class CublasHandleHolder {
diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
index 3628b7e0418fa192db3eff322d268c51c91e3862..ca861d543f1ef16d0e83ef26bad068cea2770fde 100644
--- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h
+++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
@@ -44,7 +44,10 @@ namespace paddle {
 
 namespace platform {
 
-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int64_t>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}
 
 /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
    for round integer value into next highest power of 2. */
@@ -120,7 +123,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.
@@ -132,7 +135,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;
@@ -146,8 +149,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -162,8 +165,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
                         y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  // NOTE(zengjinle): cast std::min<int64_t> result to int is safe here,
+  // because kThreadsPerBlock is always very small.
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
+  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
   const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
@@ -172,9 +177,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
 
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
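The core fix above recurs in every helper: `blockIdx.x`, `blockDim.x`, and `threadIdx.x` are all `unsigned int`, so their product is computed in 32-bit arithmetic and wraps before the result is ever widened to `int64_t`. A minimal sketch, not part of the patch (the kernel name is illustrative), of the difference the cast makes:

```cpp
__global__ void IndexWidthDemo(int64_t* out) {
  // Both operands are unsigned int: the multiply wraps modulo 2^32, and
  // only the already-wrapped value is widened by the assignment.
  int64_t wrapped = blockIdx.x * blockDim.x + threadIdx.x;
  // Casting one operand first forces the whole expression into 64-bit
  // arithmetic, which is what the patched macros do.
  int64_t correct =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (blockIdx.x == gridDim.x - 1 && threadIdx.x == 0) {
    out[0] = wrapped;  // diverges from out[1] once the index passes 2^32
    out[1] = correct;
  }
}
```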
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
index c0f6f173a798a2b1269d6a84c638dcbc6efec580..8bcae15d3517eb63ab2a1bacb8cca4b788911163 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
@@ -67,9 +67,10 @@ namespace platform {
  *
  */
 
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                     \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
-  for (index_type i = __index__; __index__ < (num);                   \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
+  for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 class CublasHandleHolder {
diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
index c62addfd257ab961803f4bccf33f689db3ffa90f..6d33d802b1880aeb87013a4c57fd1fb0fa23c9e9 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
 
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)            \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
-  for (index_type i = __index__; __index__ < (num);          \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
+  int64_t __index__ =                                              \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
+  for (index_type i = __index__; __index__ < (num);                \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 }  // namespace gpu
diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h
index 6ea206178ca5639ad091826c33ca054c96046b08..f0a37d4fb7be6c40e1ef01ecaed44c000f94469e 100644
--- a/paddle/phi/backends/gpu/gpu_launch_config.h
+++ b/paddle/phi/backends/gpu/gpu_launch_config.h
@@ -162,8 +162,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -178,7 +178,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
                         y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
@@ -188,8 +188,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
-  int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
 
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
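With `x_dim` widened to `int64_t`, a bare `std::min(x_dim, kThreadsPerBlock)` no longer compiles: template argument deduction sees conflicting types (`int64_t` vs `int`), which is why the patch spells out `std::min<int64_t>` and `std::max<int64_t>`. A standalone host-side sketch (names are illustrative) of the pattern and of why the narrowing back to `int` is safe:

```cpp
#include <algorithm>
#include <cstdint>

int main() {
  const int64_t x_dim = int64_t{5} << 32;  // a dimension past INT_MAX
  const int kThreadsPerBlock = 256;
  // std::min(x_dim, kThreadsPerBlock) fails to deduce a single T;
  // naming T promotes kThreadsPerBlock to int64_t before comparing.
  int block_cols =
      static_cast<int>(std::min<int64_t>(x_dim, kThreadsPerBlock));
  // The result is bounded above by kThreadsPerBlock (256), so storing
  // it in an int cannot overflow -- the reasoning in the NOTE above.
  return block_cols == kThreadsPerBlock ? 0 : 1;
}
```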
diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h
index 14e9ca660bdf90362053111801c885c809e69475..e25dea28e36c101254d951d887188f6a0ce6cbd5 100644
--- a/paddle/phi/backends/gpu/rocm/rocm_helper.h
+++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h
@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
 
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                     \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
-  for (index_type i = __index__; __index__ < (num);                   \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
+  for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 }  // namespace gpu
diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu
index 2ae9e9333ecb568e73115570fa08db77e162d85b..abe7757df7205c9ce6316ba30e582d1d655883f5 100644
--- a/paddle/phi/kernels/gpu/one_hot_kernel.cu
+++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -28,8 +29,7 @@ __global__ void FillOutputKernel(const InT* p_in_data,
                                  OutT* p_out_data,
                                  const int64_t numel,
                                  const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
+  CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t) {
     PADDLE_ENFORCE(p_in_data[idx] >= 0 && p_in_data[idx] < depth,
                    "Illegal index value, Input(input) value should be "
                    "greater than or equal to 0, and less than depth [%d], "
@@ -62,9 +62,10 @@ struct OneHotV2OpCUDAFunctor {
     auto stream = ctx_.stream();
     funcs::set_constant(ctx_, out_, 0.0);
 
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS,
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel);
+
+    FillOutputKernel<<<config.block_per_grid,
+                       config.thread_per_block,
                        0,
                        stream>>>(p_in_data, p_out_data, numel, depth_);
   }
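The one_hot change swaps a one-element-per-thread guard (`int idx = ...; if (idx < numel)`) for the grid-stride macro. That matters twice over: the old `int idx` overflowed past 2^31 elements, and once `GetGpuLaunchConfig1D` caps the grid below `numel`, a single bounds check would silently skip the tail. A sketch (illustrative kernel, not from the patch, assuming the `CUDA_KERNEL_LOOP_TYPE` macro above is in scope) of how the macro is used:

```cpp
template <typename T>
__global__ void ScaleKernel(T* data, const int64_t numel, const T factor) {
  // Each thread visits idx, idx + total_threads, idx + 2 * total_threads,
  // ... so every element is covered even when the grid was capped.
  CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t) { data[idx] *= factor; }
}
```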
diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu
index e5c8d392e60ed39243611f242e3ed88edaa087a3..c079b61c06e944f076ee8ea6a86a32532e2a37e7 100644
--- a/paddle/phi/kernels/gpu/stack_kernel.cu
+++ b/paddle/phi/kernels/gpu/stack_kernel.cu
@@ -23,20 +23,23 @@ namespace phi {
 template <typename T, typename IntType>
 __global__ void StackCUDAKernel(T** input_ptrs,
-                                int split_size,
-                                int rows,
-                                int cols,
+                                IntType split_size,
+                                IntType rows,
+                                IntType cols,
                                 T* __restrict__ output) {
-  IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
+  IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;
 
-  for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
-    IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
+  for (; grid_x < cols; grid_x += grid_x_stride) {
+    IntType grid_y =
+        static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;
 
     IntType split = grid_x / split_size;
     const T* input_ptr = input_ptrs[split];
     IntType col_offset = grid_x % split_size;
 #pragma unroll
-    for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
+    for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
           input_ptr[grid_y * split_size + col_offset];
     }
   }
@@ -69,12 +72,12 @@ void StackKernel(const Context& dev_ctx,
                                      dev_ctx.stream());
 
   // Split x dim from axis to matrix
-  int x_row = 1, x_col = 1;
+  int64_t x_row = 1, x_col = 1;
   for (int i = 0; i < axis; ++i) {
     x_row *= x[0]->dims()[i];
   }
   x_col = x[0]->numel() / x_row;
-  int out_col = x_col * n;
+  int64_t out_col = x_col * n;
 
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
@@ -85,9 +88,9 @@ void StackKernel(const Context& dev_ctx,
            config.thread_per_block,
            0,
            dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                               x_col,
-                               x_row,
-                               out_col,
+                               static_cast<int32_t>(x_col),
+                               static_cast<int32_t>(x_row),
+                               static_cast<int32_t>(out_col),
                                y_data);
   } else {
     StackCUDAKernel<T, int64_t>
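The diff is cut off inside `StackKernel`'s else branch, but the shape of the dispatch is visible from the casts: when every linear index fits in `int32_t`, the kernel is instantiated with 32-bit index math (cheaper on GPUs), and otherwise it falls back to the 64-bit instantiation. A compile-only sketch of that pattern (the exact condition and the int64 branch are inferred, not quoted from the file; `launch32`/`launch64` stand in for the two kernel launches):

```cpp
#include <cstdint>
#include <limits>

template <typename Launch32, typename Launch64>
void DispatchByIndexWidth(int64_t x_row, int64_t out_col,
                          Launch32 launch32, Launch64 launch64) {
  if (out_col * x_row <= std::numeric_limits<int32_t>::max()) {
    // All of split_size, rows, cols, and every grid_y * cols + grid_x
    // fit in int32_t, so the narrowing casts in the launch are safe.
    launch32(static_cast<int32_t>(out_col), static_cast<int32_t>(x_row));
  } else {
    launch64(out_col, x_row);
  }
}
```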