Unverified commit 15577630. Author: Yiqun Liu. Committer: GitHub.

Use int64_t in GetGpuLaunchConfig1D and ElementwiseKernel as index type to support large tensor. (#43506)

* Change some data type from int to int64_t in GetGpuLaunchConfig1D to support large tensor.

* Use int64_t in ElementwiseKernel as index type to support large tensor.
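
The motivation in one sentence: once a tensor holds more than 2^31 - 1 elements, a 32-bit index can no longer address it, so offsets and the launch-config arithmetic have to be carried in 64 bits. A standalone illustration of the failure mode (not Paddle code; values chosen for the example):

// Standalone illustration (not Paddle code): a 32-bit index cannot address
// tensors whose element count exceeds INT_MAX.
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = (1LL << 31) + 7;            // element count just above INT_MAX
  int64_t offset64 = numel - 1;               // correct 64-bit offset of the last element
  int offset32 = static_cast<int>(offset64);  // narrowed; negative on two's-complement targets
  std::cout << "numel = " << numel << ", int64_t offset = " << offset64
            << ", int offset = " << offset32 << '\n';
  return 0;
}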
Parent: 332fdd1e
@@ -37,8 +37,7 @@
 // HIP results in error or nan if > 256
 #define PREDEFINED_BLOCK_SIZE 256
 #else
-/* CUDA performs better as thread_per_block
-   num is between [64, 512] */
+// CUDA performs better when thread_per_block is between [64, 512]
 #define PREDEFINED_BLOCK_SIZE 512
 #endif

@@ -46,22 +45,27 @@ namespace phi {
 namespace backends {
 namespace gpu {

-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int64_t>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}

-/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-   for round integer value into next highest power of 2. */
-static inline int RoundToPowerOfTwo(int n) {
+// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+// for round integer value into next highest power of 2.
+inline int64_t RoundToPowerOfTwo(int64_t n) {
   n--;
   n |= (n >> 1);
   n |= (n >> 2);
   n |= (n >> 4);
   n |= (n >> 8);
   n |= (n >> 16);
+  int64_t min_val = 32;
 #ifdef __HIPCC__
-  return std::min(256, std::max(32, (n + 1)));
+  int64_t max_val = 256;
 #else
-  return std::min(1024, std::max(32, (n + 1)));
+  int64_t max_val = 1024;
 #endif
+  return std::min(max_val, std::max(min_val, (n + 1)));
 }

 #ifdef WITH_NV_JETSON

@@ -106,12 +110,17 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   PADDLE_ENFORCE_GE(numel,
                     0,
                     phi::errors::InvalidArgument(
-                        "element quantity should be greater than or equal 0,"
-                        " but received value is: %d.",
+                        "numel is expected to be greater than or equal 0,"
+                        " but received %d.",
                         numel));
+  PADDLE_ENFORCE_GE(
+      vec_size,
+      1,
+      phi::errors::InvalidArgument(
+          "vec_size is expected greater than 0, but received %d.", vec_size));
   // Get compute_capability
   const int capability = context.GetComputeCapability();
-  /* If thread number per block is 64/128/256/512, cuda performs better.*/
+  // If thread number per block is 64/128/256/512, cuda performs better.
   int limit_threads =
       std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
 #ifdef WITH_NV_JETSON

@@ -121,7 +130,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.

@@ -133,7 +142,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;

@@ -143,6 +152,11 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   config.thread_per_block.x = threads;
   config.block_per_grid.x = blocks;
   config.compute_capability = capability;
+  VLOG(3) << "Get 1-D launch config: numel=" << numel
+          << ", vec_size=" << vec_size << ", block_size=" << threads
+          << ", grid_size=" << blocks << ", limit_blocks=" << limit_blocks
+          << ", limit_threads=" << limit_threads;
   return config;
 }

@@ -163,19 +177,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
                         y_dim));
   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
-  const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
+  const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1);
   GpuLaunchConfig config;
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;

@@ -202,13 +215,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context,
   int block_y = std::min(GetLastPow2(height), max_threads / block_x);
   int block_z = std::min(num_img, max_threads / block_x / block_y);
-  auto max_grid_dim = context.GetCUDAMaxGridDimSize();
-  int grid_x =
-      std::min<int>(max_grid_dim[0], backends::gpu::DivUp(width, block_x));
-  int grid_y =
-      std::min<int>(max_grid_dim[1], backends::gpu::DivUp(height, block_y));
-  int grid_z = std::min<int>(max_grid_dim[2],
-                             backends::gpu::DivUp(num_img, block_z * 4));
+  std::array<int, 3> max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min(max_grid_dim[0], DivUp<int>(width, block_x));
+  int grid_y = std::min(max_grid_dim[1], DivUp<int>(height, block_y));
+  int grid_z = std::min(max_grid_dim[2], DivUp<int>(num_img, block_z * 4));
   const int capability = context.GetComputeCapability();
   GpuLaunchConfig config;
...
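
As a quick sanity check of the hunks above, here is a self-contained sketch (not the Paddle API; the grid limit is an assumed typical value) showing how the block count for a tensor with more than INT_MAX elements is computed with the 64-bit DivUp before being clamped:

// Sketch of the 1-D launch-config arithmetic with a 64-bit DivUp.
#include <algorithm>
#include <cstdint>
#include <iostream>

template <typename T = int64_t>
inline T DivUp(T a, T b) { return (a + b - 1) / b; }

int main() {
  int64_t numel = 5LL * 1024 * 1024 * 1024;  // ~5.4e9 elements, > INT_MAX
  int vec_size = 4;                          // elements handled per thread
  int threads = 512;                         // PREDEFINED_BLOCK_SIZE on CUDA
  int64_t limit_blocks = 2147483647;         // assumed gridDim.x limit
  // The 64-bit divisions keep the intermediate value correct for huge numel.
  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
  blocks = std::min(blocks, limit_blocks);
  std::cout << "blocks = " << blocks << '\n';  // prints 2621440
  return 0;
}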
@@ -511,8 +511,8 @@ struct Loader {
   template <typename Array, typename ArgsT>
   static __device__ void Apply(const Array &in,
                                ArgsT *args,
+                               kps::IndexType offset,
                                int num,
-                               int data_offset,
                                int read_lens,
                                bool is_boundary) {
     using Type = std::tuple_element_t<Index, ArgsT>;

@@ -521,13 +521,13 @@ struct Loader {
     if (is_boundary) {
       kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
           args,
-          reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
+          reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
           num,
           read_lens);
     } else {
       kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
           args,
-          reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
+          reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
           num,
           read_lens);
     }

@@ -681,46 +681,12 @@ struct SameDimsElementwisePrimitiveCaller {
   }
 };

-template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
-struct ElementwiseWriteDataCaller {
-  __device__ __forceinline__ void operator()(
-      phi::Array<_ptr_ OutT *, NumOuts> outs,
-      ConditionalT<OutT, NumOuts> src[VecSize],
-      int block_offset,
-      int num) {
-    OutT dst[NumOuts][VecSize];
-#pragma unroll
-    for (int i = 0; i < VecSize; ++i) {
-#pragma unroll
-      for (int j = 0; j < NumOuts; ++j) {
-        dst[j][i] = (src[i])[j];
-      }
-    }
-#pragma unroll
-    for (int i = 0; i < NumOuts; ++i) {
-      kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
-          outs[i] + block_offset, dst[i], num);
-    }
-  }
-};
-
-template <typename OutT, int VecSize, bool IsBoundary>
-struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
-  __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs,
-                                             OutT src[VecSize],
-                                             int block_offset,
-                                             int num) {
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
-        outs[0] + block_offset, src, num);
-  }
-};
-
 template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCallerBc {
   __device__ __forceinline__ void operator()(
       phi::Array<_ptr_ OutT *, NumOuts> outs,
       ConditionalT<OutT, NumOuts> src[VecSize],
-      int block_offset,
+      kps::IndexType block_offset,
       int num,
       int read_lens) {
     OutT dst[NumOuts][VecSize];

@@ -743,7 +709,7 @@ template <typename OutT, int VecSize, bool IsBoundary>
 struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
   __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs,
                                              OutT src[VecSize],
-                                             int block_offset,
+                                             kps::IndexType block_offset,
                                              int num,
                                              int read_lens) {
     kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(

@@ -758,11 +724,10 @@ template <typename OutT,
           int VecSize,
           bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
     const phi::Array<const _ptr_ char *__restrict__, Arity> &in,
     phi::Array<_ptr_ OutT *, NumOuts> outs,
+    kps::IndexType offset,
     int num,
-    int data_offset,
     int read_lens,
     Functor func) {
   using Traits = paddle::platform::FunctionTraits<Functor>;

@@ -771,7 +736,7 @@ __device__ void VectorizedElementwiseKernelImpl(
   ConditionalT<OutT, NumOuts> result[VecSize];

   Unroller<Loader, VecSize, Arity>::step(
-      in, args, num, data_offset, read_lens, IsBoundary);
+      in, args, offset, num, read_lens, IsBoundary);

   SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
                                      VecSize,

@@ -780,19 +745,19 @@ __device__ void VectorizedElementwiseKernelImpl(
                                      Arity>()(func, args, result, read_lens);
   ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, NumOuts>()(
-      outs, result, data_offset, num, read_lens);
+      outs, result, offset, num, read_lens);
 }

 template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 __global__ void VectorizedElementwiseKernel(
     phi::Array<const _ptr_ char *__restrict__, Arity> ins,
     phi::Array<_ptr_ OutT *, NumOuts> outs,
-    int size,
-    int main_offset,
+    kps::IndexType numel,
+    kps::IndexType main_offset,
     int read_lens,
     Functor func) {
-  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
-  int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
+  kps::IndexType data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
+  kps::IndexType stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
   for (; data_offset < main_offset; data_offset += stride) {
     VectorizedElementwiseKernelImpl<OutT,
                                     Functor,

@@ -800,29 +765,31 @@ __global__ void VectorizedElementwiseKernel(
                                     NumOuts,
                                     VecSize,
                                     false>(
-        ins, outs, read_lens * BLOCK_NUM_X, data_offset, read_lens, func);
+        ins, outs, data_offset, read_lens * BLOCK_NUM_X, read_lens, func);
   }

-  int num = size - data_offset;
-  if (num > 0) {
+  int remain = numel - data_offset;
+  if (remain > 0) {
     VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
                                     Arity,
                                     NumOuts,
                                     VecSize,
                                     true>(
-        ins, outs, num, data_offset, read_lens, func);
+        ins, outs, data_offset, remain, read_lens, func);
   }
 }

 template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
-void ElementwiseCudaKernel(const KPDevice &ctx,
-                           const std::vector<const DenseTensor *> &ins,
-                           std::vector<DenseTensor *> *outs,
-                           int read_lens,
-                           Functor func) {
-  auto numel =
-      (*outs)[0]->numel();  // To avoid running errors when ins.size()== 0
+void LaunchElementwiseCudaKernel(const KPDevice &ctx,
+                                 const std::vector<const DenseTensor *> &ins,
+                                 std::vector<DenseTensor *> *outs,
+                                 int read_lens,
+                                 Functor func) {
+  // There are at least 1 output, but maybe 0 input (ins.size() == 0).
+  // For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index
+  // type.
+  int64_t numel = (*outs)[0]->numel();
   phi::Array<const _ptr_ char *__restrict__, Arity> ins_data;
   phi::Array<_ptr_ OutT *, NumOuts> outs_data;

@@ -834,15 +801,16 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int block_size = 64;
   int grid_size = 8;
   auto stream = ctx.x_context()->xpu_stream;
-  int main_offset = (numel / (read_lens * block_size)) * read_lens * block_size;
+  int64_t main_offset =
+      (numel / (read_lens * block_size)) * read_lens * block_size;
   VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
       <<<grid_size, block_size, 0, stream>>>(
           ins_data, outs_data, numel, main_offset, read_lens, func);
 #else
   auto gpu_config =
       phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
-  int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
-                    gpu_config.GetBlockSize();
+  int64_t main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) *
+                        VecSize * gpu_config.GetBlockSize();
   auto stream = ctx.stream();
   VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
       <<<gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(

@@ -901,15 +869,15 @@ void ElementwiseKernel(const KPDevice &ctx,
 #endif
   switch (vec_size) {
     case VecSizeL:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
+      LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
          ctx, ins, outs, read_lens, func);
       break;
     case VecSizeM:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
+      LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
          ctx, ins, outs, read_lens, func);
       break;
     case VecSizeS:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
+      LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
          ctx, ins, outs, read_lens, func);
       break;
     default: {
...
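
The main_offset value above is numel rounded down to a multiple of the per-block read width, so the grid-stride loop only ever issues full vector loads and the tail is handled by a single boundary call. A host-side sketch of that split (illustrative numbers, not Paddle code):

// Host-side sketch of the main_offset / remainder split.
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = 1000003;   // total element count
  int vec_size = 4;          // VecSize
  int block_size = 512;      // threads per block
  int64_t chunk = static_cast<int64_t>(vec_size) * block_size;  // 2048
  int64_t main_offset = (numel / chunk) * chunk;  // vectorized part: 999424
  int64_t remain = numel - main_offset;           // boundary part: 579
  std::cout << "main_offset = " << main_offset << ", remain = " << remain << '\n';
  return 0;
}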
@@ -87,3 +87,16 @@
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 #endif
+
+namespace phi {
+namespace kps {
+
+#ifdef PADDLE_WITH_XPU_KP
+// The type of index used in kernel
+using IndexType = int;
+#else
+using IndexType = int64_t;
+#endif
+
+}  // namespace kps
+}  // namespace phi