Unverified · Commit 15577630 · Author: Yiqun Liu · Committer: GitHub


Use int64_t in GetGpuLaunchConfig1D and ElementwiseKernel as index type to support large tensor. (#43506)

* Change some data type from int to int64_t in GetGpuLaunchConfig1D to support large tensor.

* Use int64_t in ElementwiseKernel as index type to support large tensor.
Parent 332fdd1e
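To see why the index type is widened, here is a minimal grid-stride kernel sketch, separate from the commit itself; scale_kernel, alpha, x and y are made-up names used only for illustration. With a plain int index, the thread-id and stride arithmetic wrap once numel exceeds 2^31 - 1, so the loop keeps everything in int64_t, mirroring what kps::IndexType resolves to on CUDA in the patch below.

#include <cstdint>

template <typename T>
__global__ void scale_kernel(const T *x, T *y, int64_t numel, T alpha) {
  // Promote to 64-bit before multiplying so the global thread id cannot wrap.
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (int64_t i = idx; i < numel; i += stride) {
    y[i] = alpha * x[i];
  }
}

The diff below threads this wider index type (int64_t / kps::IndexType) through GetGpuLaunchConfig1D, the vectorized elementwise kernels, and the launch-config arithmetic.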
@@ -37,8 +37,7 @@
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better as thread_per_block
num is between [64, 512] */
// CUDA performs better when thread_per_block is between [64, 512]
#define PREDEFINED_BLOCK_SIZE 512
#endif
@@ -46,22 +45,27 @@ namespace phi {
namespace backends {
namespace gpu {
inline int DivUp(int a, int b) { return (a + b - 1) / b; }
template <typename T = int64_t>
inline T DivUp(T a, T b) {
return (a + b - 1) / b;
}
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
for round integer value into next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
// for round integer value into next highest power of 2.
inline int64_t RoundToPowerOfTwo(int64_t n) {
n--;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
int64_t min_val = 32;
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
int64_t max_val = 256;
#else
return std::min(1024, std::max(32, (n + 1)));
int64_t max_val = 1024;
#endif
return std::min(max_val, std::max(min_val, (n + 1)));
}
#ifdef WITH_NV_JETSON
@@ -106,12 +110,17 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
PADDLE_ENFORCE_GE(numel,
0,
phi::errors::InvalidArgument(
"element quantity should be greater than or equal 0,"
" but received value is: %d.",
"numel is expected to be greater than or equal 0,"
" but received %d.",
numel));
PADDLE_ENFORCE_GE(
vec_size,
1,
phi::errors::InvalidArgument(
"vec_size is expected greater than 0, but received %d.", vec_size));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If thread number per block is 64/128/256/512, cuda performs better.*/
// If thread number per block is 64/128/256/512, cuda performs better.
int limit_threads =
std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
@@ -121,7 +130,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
#endif
int threads = limit_threads;
int sm_count = context.GetSMCount();
int active_threads_num = numel / vec_size;
int64_t active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < limit_threads) {
// Round up the thread count to the next power of 2, keeping the number
// of active blocks at about twice the SM count, to acquire better performance.
@@ -133,7 +142,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
}
// Number of threads per block shall be larger than 64.
threads = std::max(64, threads);
int blocks = DivUp(DivUp(numel, vec_size), threads);
int blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
if (blocks > limit_blocks) {
blocks = limit_blocks;
@@ -143,6 +152,11 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
config.thread_per_block.x = threads;
config.block_per_grid.x = blocks;
config.compute_capability = capability;
VLOG(3) << "Get 1-D launch config: numel=" << numel
<< ", vec_size=" << vec_size << ", block_size=" << threads
<< ", grid_size=" << blocks << ", limit_blocks=" << limit_blocks
<< ", limit_threads=" << limit_threads;
return config;
}
@@ -163,19 +177,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
y_dim));
const int kThreadsPerBlock = 256;
int block_cols = (std::min)(x_dim, kThreadsPerBlock);
int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
int block_cols = std::min(x_dim, kThreadsPerBlock);
int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
int max_physical_threads = context.GetMaxPhysicalThreadCount();
const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1);
GpuLaunchConfig config;
// Note: the block size is not aligned to 32; align it yourself if needed.
config.thread_per_block = dim3(block_cols, block_rows, 1);
int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
int grid_y =
(std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
config.block_per_grid = dim3(grid_x, grid_y, 1);
return config;
@@ -202,13 +215,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context,
int block_y = std::min(GetLastPow2(height), max_threads / block_x);
int block_z = std::min(num_img, max_threads / block_x / block_y);
auto max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x =
std::min<int>(max_grid_dim[0], backends::gpu::DivUp(width, block_x));
int grid_y =
std::min<int>(max_grid_dim[1], backends::gpu::DivUp(height, block_y));
int grid_z = std::min<int>(max_grid_dim[2],
backends::gpu::DivUp(num_img, block_z * 4));
std::array<int, 3> max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min(max_grid_dim[0], DivUp<int>(width, block_x));
int grid_y = std::min(max_grid_dim[1], DivUp<int>(height, block_y));
int grid_z = std::min(max_grid_dim[2], DivUp<int>(num_img, block_z * 4));
const int capability = context.GetComputeCapability();
GpuLaunchConfig config;
......
@@ -511,8 +511,8 @@ struct Loader {
template <typename Array, typename ArgsT>
static __device__ void Apply(const Array &in,
ArgsT *args,
kps::IndexType offset,
int num,
int data_offset,
int read_lens,
bool is_boundary) {
using Type = std::tuple_element_t<Index, ArgsT>;
@@ -521,13 +521,13 @@ struct Loader {
if (is_boundary) {
kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
args,
reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
num,
read_lens);
} else {
kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
args,
reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
num,
read_lens);
}
@@ -681,46 +681,12 @@ struct SameDimsElementwisePrimitiveCaller {
}
};
template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
struct ElementwiseWriteDataCaller {
__device__ __forceinline__ void operator()(
phi::Array<_ptr_ OutT *, NumOuts> outs,
ConditionalT<OutT, NumOuts> src[VecSize],
int block_offset,
int num) {
OutT dst[NumOuts][VecSize];
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
#pragma unroll
for (int j = 0; j < NumOuts; ++j) {
dst[j][i] = (src[i])[j];
}
}
#pragma unroll
for (int i = 0; i < NumOuts; ++i) {
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
outs[i] + block_offset, dst[i], num);
}
}
};
template <typename OutT, int VecSize, bool IsBoundary>
struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
__device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs,
OutT src[VecSize],
int block_offset,
int num) {
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
outs[0] + block_offset, src, num);
}
};
template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
struct ElementwiseWriteDataCallerBc {
__device__ __forceinline__ void operator()(
phi::Array<_ptr_ OutT *, NumOuts> outs,
ConditionalT<OutT, NumOuts> src[VecSize],
int block_offset,
kps::IndexType block_offset,
int num,
int read_lens) {
OutT dst[NumOuts][VecSize];
@@ -743,7 +709,7 @@ template <typename OutT, int VecSize, bool IsBoundary>
struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
__device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs,
OutT src[VecSize],
int block_offset,
kps::IndexType block_offset,
int num,
int read_lens) {
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
@@ -758,11 +724,10 @@ template <typename OutT,
int VecSize,
bool IsBoundary>
__device__ void VectorizedElementwiseKernelImpl(
const phi::Array<const _ptr_ char *__restrict__, Arity> &in,
phi::Array<_ptr_ OutT *, NumOuts> outs,
kps::IndexType offset,
int num,
int data_offset,
int read_lens,
Functor func) {
using Traits = paddle::platform::FunctionTraits<Functor>;
@@ -771,7 +736,7 @@ __device__ void VectorizedElementwiseKernelImpl(
ConditionalT<OutT, NumOuts> result[VecSize];
Unroller<Loader, VecSize, Arity>::step(
in, args, num, data_offset, read_lens, IsBoundary);
in, args, offset, num, read_lens, IsBoundary);
SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
VecSize,
@@ -780,19 +745,19 @@ __device__ void VectorizedElementwiseKernelImpl(
Arity>()(func, args, result, read_lens);
ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, NumOuts>()(
outs, result, data_offset, num, read_lens);
outs, result, offset, num, read_lens);
}
template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
__global__ void VectorizedElementwiseKernel(
phi::Array<const _ptr_ char *__restrict__, Arity> ins,
phi::Array<_ptr_ OutT *, NumOuts> outs,
int size,
int main_offset,
kps::IndexType numel,
kps::IndexType main_offset,
int read_lens,
Functor func) {
int data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
kps::IndexType data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
kps::IndexType stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
for (; data_offset < main_offset; data_offset += stride) {
VectorizedElementwiseKernelImpl<OutT,
Functor,
@@ -800,29 +765,31 @@ __global__ void VectorizedElementwiseKernel(
NumOuts,
VecSize,
false>(
ins, outs, read_lens * BLOCK_NUM_X, data_offset, read_lens, func);
ins, outs, data_offset, read_lens * BLOCK_NUM_X, read_lens, func);
}
int num = size - data_offset;
if (num > 0) {
int remain = numel - data_offset;
if (remain > 0) {
VectorizedElementwiseKernelImpl<OutT,
Functor,
Arity,
NumOuts,
VecSize,
true>(
ins, outs, num, data_offset, read_lens, func);
ins, outs, data_offset, remain, read_lens, func);
}
}
template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
void ElementwiseCudaKernel(const KPDevice &ctx,
const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
int read_lens,
Functor func) {
auto numel =
(*outs)[0]->numel(); // To avoid running errors when ins.size()== 0
void LaunchElementwiseCudaKernel(const KPDevice &ctx,
const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
int read_lens,
Functor func) {
// There are at least 1 output, but maybe 0 input (ins.size() == 0).
// For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index
// type.
int64_t numel = (*outs)[0]->numel();
phi::Array<const _ptr_ char *__restrict__, Arity> ins_data;
phi::Array<_ptr_ OutT *, NumOuts> outs_data;
@@ -834,15 +801,16 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
int block_size = 64;
int grid_size = 8;
auto stream = ctx.x_context()->xpu_stream;
int main_offset = (numel / (read_lens * block_size)) * read_lens * block_size;
int64_t main_offset =
(numel / (read_lens * block_size)) * read_lens * block_size;
VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
<<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, read_lens, func);
#else
auto gpu_config =
phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize();
int64_t main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) *
VecSize * gpu_config.GetBlockSize();
auto stream = ctx.stream();
VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
<<<gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
@@ -901,15 +869,15 @@ void ElementwiseKernel(const KPDevice &ctx,
#endif
switch (vec_size) {
case VecSizeL:
ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
ctx, ins, outs, read_lens, func);
break;
case VecSizeM:
ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
ctx, ins, outs, read_lens, func);
break;
case VecSizeS:
ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
ctx, ins, outs, read_lens, func);
break;
default: {
......
@@ -87,3 +87,16 @@
#include "paddle/phi/kernels/primitive/functor_primitives.h"
#endif
namespace phi {
namespace kps {
#ifdef PADDLE_WITH_XPU_KP
// The type of index used in kernel
using IndexType = int;
#else
using IndexType = int64_t;
#endif
} // namespace kps
} // namespace phi
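As a rough sanity check of the widened arithmetic above, the host-only snippet below reproduces the block-count calculation of GetGpuLaunchConfig1D for a tensor with more than INT32_MAX elements. It is not part of the patch; the main() harness and the chosen numel, vec_size and threads values are only illustrative.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same shape as the templated DivUp introduced in gpu_launch_config.h.
template <typename T = int64_t>
inline T DivUp(T a, T b) {
  return (a + b - 1) / b;
}

int main() {
  // ~3 billion elements: does not even fit in a 32-bit int, so the old
  // int-only DivUp could not express this computation without overflow.
  const int64_t numel = int64_t{3} * 1000 * 1000 * 1000;
  const int vec_size = 4;
  const int threads = 512;  // PREDEFINED_BLOCK_SIZE on CUDA
  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
  std::printf("numel=%lld -> blocks=%lld\n",
              static_cast<long long>(numel),
              static_cast<long long>(blocks));
  return 0;
}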