Unverified commit 15577630. Author: Yiqun Liu. Committer: GitHub.

Use int64_t in GetGpuLaunchConfig1D and ElementwiseKernel as index type to support large tensor. (#43506)

* Change some data type from int to int64_t in GetGpuLaunchConfig1D to support large tensor.

* Use int64_t in ElementwiseKernel as index type to support large tensor.
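
The motivation in one sentence: once a tensor holds more than 2^31 - 1 elements, a 32-bit index can no longer address it, so offsets and the launch-config arithmetic have to be carried in 64 bits. A standalone illustration of the failure mode (not Paddle code; values chosen for the example):

// Standalone illustration (not Paddle code): a 32-bit index cannot address
// tensors whose element count exceeds INT_MAX.
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = (1LL << 31) + 7;            // element count just above INT_MAX
  int64_t offset64 = numel - 1;               // correct 64-bit offset of the last element
  int offset32 = static_cast<int>(offset64);  // narrowed; negative on two's-complement targets
  std::cout << "numel = " << numel << ", int64_t offset = " << offset64
            << ", int offset = " << offset32 << '\n';
  return 0;
}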
Parent: 332fdd1e
@@ -37,8 +37,7 @@
 // HIP results in error or nan if > 256
 #define PREDEFINED_BLOCK_SIZE 256
 #else
-/* CUDA performs better as thread_per_block
-   num is between [64, 512] */
+// CUDA performs better when thread_per_block is between [64, 512]
 #define PREDEFINED_BLOCK_SIZE 512
 #endif

@@ -46,22 +45,27 @@ namespace phi {
 namespace backends {
 namespace gpu {

-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int64_t>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}

-/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-   for round integer value into next highest power of 2. */
-static inline int RoundToPowerOfTwo(int n) {
+// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+// for round integer value into next highest power of 2.
+inline int64_t RoundToPowerOfTwo(int64_t n) {
   n--;
   n |= (n >> 1);
   n |= (n >> 2);
   n |= (n >> 4);
   n |= (n >> 8);
   n |= (n >> 16);
+  int64_t min_val = 32;
 #ifdef __HIPCC__
-  return std::min(256, std::max(32, (n + 1)));
+  int64_t max_val = 256;
 #else
-  return std::min(1024, std::max(32, (n + 1)));
+  int64_t max_val = 1024;
 #endif
+  return std::min(max_val, std::max(min_val, (n + 1)));
 }

 #ifdef WITH_NV_JETSON

@@ -106,12 +110,17 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   PADDLE_ENFORCE_GE(numel,
                     0,
                     phi::errors::InvalidArgument(
-                        "element quantity should be greater than or equal 0,"
-                        " but received value is: %d.",
+                        "numel is expected to be greater than or equal 0,"
+                        " but received %d.",
                         numel));
+  PADDLE_ENFORCE_GE(
+      vec_size,
+      1,
+      phi::errors::InvalidArgument(
+          "vec_size is expected greater than 0, but received %d.", vec_size));
   // Get compute_capability
   const int capability = context.GetComputeCapability();
-  /* If thread number per block is 64/128/256/512, cuda performs better.*/
+  // If thread number per block is 64/128/256/512, cuda performs better.
   int limit_threads =
       std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
 #ifdef WITH_NV_JETSON

@@ -121,7 +130,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.

@@ -133,7 +142,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;

@@ -143,6 +152,11 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   config.thread_per_block.x = threads;
   config.block_per_grid.x = blocks;
   config.compute_capability = capability;
+  VLOG(3) << "Get 1-D launch config: numel=" << numel
+          << ", vec_size=" << vec_size << ", block_size=" << threads
+          << ", grid_size=" << blocks << ", limit_blocks=" << limit_blocks
+          << ", limit_threads=" << limit_threads;
   return config;
 }

@@ -163,19 +177,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
                         y_dim));
   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
-  const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
+  const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1);
   GpuLaunchConfig config;
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;

@@ -202,13 +215,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context,
   int block_y = std::min(GetLastPow2(height), max_threads / block_x);
   int block_z = std::min(num_img, max_threads / block_x / block_y);
-  auto max_grid_dim = context.GetCUDAMaxGridDimSize();
-  int grid_x =
-      std::min<int>(max_grid_dim[0], backends::gpu::DivUp(width, block_x));
-  int grid_y =
-      std::min<int>(max_grid_dim[1], backends::gpu::DivUp(height, block_y));
-  int grid_z = std::min<int>(max_grid_dim[2],
-                             backends::gpu::DivUp(num_img, block_z * 4));
+  std::array<int, 3> max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min(max_grid_dim[0], DivUp<int>(width, block_x));
+  int grid_y = std::min(max_grid_dim[1], DivUp<int>(height, block_y));
+  int grid_z = std::min(max_grid_dim[2], DivUp<int>(num_img, block_z * 4));
   const int capability = context.GetComputeCapability();
   GpuLaunchConfig config;
...
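
As a quick sanity check of the hunks above, here is a self-contained sketch (not the Paddle API; the grid limit is an assumed typical value) showing how the block count for a tensor with more than INT_MAX elements is computed with the 64-bit DivUp before being clamped:

// Sketch of the 1-D launch-config arithmetic with a 64-bit DivUp.
#include <algorithm>
#include <cstdint>
#include <iostream>

template <typename T = int64_t>
inline T DivUp(T a, T b) { return (a + b - 1) / b; }

int main() {
  int64_t numel = 5LL * 1024 * 1024 * 1024;  // ~5.4e9 elements, > INT_MAX
  int vec_size = 4;                          // elements handled per thread
  int threads = 512;                         // PREDEFINED_BLOCK_SIZE on CUDA
  int64_t limit_blocks = 2147483647;         // assumed gridDim.x limit
  // The 64-bit divisions keep the intermediate value correct for huge numel.
  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
  blocks = std::min(blocks, limit_blocks);
  std::cout << "blocks = " << blocks << '\n';  // prints 2621440
  return 0;
}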
@@ -511,8 +511,8 @@ struct Loader {
   template <typename Array, typename ArgsT>
   static __device__ void Apply(const Array &in,
                                ArgsT *args,
+                               kps::IndexType offset,
                                int num,
-                               int data_offset,
                                int read_lens,
                                bool is_boundary) {
     using Type = std::tuple_element_t<Index, ArgsT>;

@@ -521,13 +521,13 @@ struct Loader {
     if (is_boundary) {
       kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
           args,
-          reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
+          reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
           num,
           read_lens);
     } else {
       kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
           args,
-          reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
+          reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
           num,
           read_lens);
     }

@@ -681,46 +681,12 @@ struct SameDimsElementwisePrimitiveCaller {
   }
 };

-template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
-struct ElementwiseWriteDataCaller {
-  __device__ __forceinline__ void operator()(
-      phi::Array<_ptr_ OutT *, NumOuts> outs,
-      ConditionalT<OutT, NumOuts> src[VecSize],
-      int block_offset,
-      int num) {
-    OutT dst[NumOuts][VecSize];
-#pragma unroll
-    for (int i = 0; i < VecSize; ++i) {
-#pragma unroll
-      for (int j = 0; j < NumOuts; ++j) {
-        dst[j][i] = (src[i])[j];
-      }
-    }
-#pragma unroll
-    for (int i = 0; i < NumOuts; ++i) {
-      kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
-          outs[i] + block_offset, dst[i], num);
-    }
-  }
-};
-
-template <typename OutT, int VecSize, bool IsBoundary>
-struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
-  __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs,
-                                             OutT src[VecSize],
-                                             int block_offset,
-                                             int num) {
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
-        outs[0] + block_offset, src, num);
-  }
-};
-
 template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCallerBc {
   __device__ __forceinline__ void operator()(
       phi::Array<_ptr_ OutT *, NumOuts> outs,
       ConditionalT<OutT, NumOuts> src[VecSize],
-      int block_offset,
+      kps::IndexType block_offset,
       int num,
       int read_lens) {
     OutT dst[NumOuts][VecSize];

@@ -743,7 +709,7 @@ template <typename OutT, int VecSize, bool IsBoundary>
 struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
   __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs,
                                              OutT src[VecSize],
-                                             int block_offset,
+                                             kps::IndexType block_offset,
                                              int num,
                                              int read_lens) {
     kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(

@@ -758,11 +724,10 @@ template <typename OutT,
           int VecSize,
           bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
     const phi::Array<const _ptr_ char *__restrict__, Arity> &in,
     phi::Array<_ptr_ OutT *, NumOuts> outs,
+    kps::IndexType offset,
     int num,
-    int data_offset,
     int read_lens,
     Functor func) {
   using Traits = paddle::platform::FunctionTraits<Functor>;

@@ -771,7 +736,7 @@ __device__ void VectorizedElementwiseKernelImpl(
   ConditionalT<OutT, NumOuts> result[VecSize];

   Unroller<Loader, VecSize, Arity>::step(
-      in, args, num, data_offset, read_lens, IsBoundary);
+      in, args, offset, num, read_lens, IsBoundary);

   SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
                                      VecSize,

@@ -780,19 +745,19 @@ __device__ void VectorizedElementwiseKernelImpl(
                                      Arity>()(func, args, result, read_lens);
   ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, NumOuts>()(
-      outs, result, data_offset, num, read_lens);
+      outs, result, offset, num, read_lens);
 }

 template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 __global__ void VectorizedElementwiseKernel(
     phi::Array<const _ptr_ char *__restrict__, Arity> ins,
     phi::Array<_ptr_ OutT *, NumOuts> outs,
-    int size,
-    int main_offset,
+    kps::IndexType numel,
+    kps::IndexType main_offset,
     int read_lens,
     Functor func) {
-  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
-  int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
+  kps::IndexType data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
+  kps::IndexType stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
   for (; data_offset < main_offset; data_offset += stride) {
     VectorizedElementwiseKernelImpl<OutT,
                                     Functor,

@@ -800,29 +765,31 @@ __global__ void VectorizedElementwiseKernel(
                                     NumOuts,
                                     VecSize,
                                     false>(
-        ins, outs, read_lens * BLOCK_NUM_X, data_offset, read_lens, func);
+        ins, outs, data_offset, read_lens * BLOCK_NUM_X, read_lens, func);
   }

-  int num = size - data_offset;
-  if (num > 0) {
+  int remain = numel - data_offset;
+  if (remain > 0) {
     VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
                                     Arity,
                                     NumOuts,
                                     VecSize,
                                     true>(
-        ins, outs, num, data_offset, read_lens, func);
+        ins, outs, data_offset, remain, read_lens, func);
   }
 }

 template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
-void ElementwiseCudaKernel(const KPDevice &ctx,
-                           const std::vector<const DenseTensor *> &ins,
-                           std::vector<DenseTensor *> *outs,
-                           int read_lens,
-                           Functor func) {
-  auto numel =
-      (*outs)[0]->numel();  // To avoid running errors when ins.size()== 0
+void LaunchElementwiseCudaKernel(const KPDevice &ctx,
+                                 const std::vector<const DenseTensor *> &ins,
+                                 std::vector<DenseTensor *> *outs,
+                                 int read_lens,
+                                 Functor func) {
+  // There are at least 1 output, but maybe 0 input (ins.size() == 0).
+  // For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index
+  // type.
+  int64_t numel = (*outs)[0]->numel();
   phi::Array<const _ptr_ char *__restrict__, Arity> ins_data;
   phi::Array<_ptr_ OutT *, NumOuts> outs_data;

@@ -834,15 +801,16 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int block_size = 64;
   int grid_size = 8;
   auto stream = ctx.x_context()->xpu_stream;
-  int main_offset = (numel / (read_lens * block_size)) * read_lens * block_size;
+  int64_t main_offset =
+      (numel / (read_lens * block_size)) * read_lens * block_size;
   VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
       <<<grid_size, block_size, 0, stream>>>(
           ins_data, outs_data, numel, main_offset, read_lens, func);
 #else
   auto gpu_config =
       phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
-  int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
-                    gpu_config.GetBlockSize();
+  int64_t main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) *
+                        VecSize * gpu_config.GetBlockSize();
   auto stream = ctx.stream();
   VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
       <<<gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(

@@ -901,15 +869,15 @@ void ElementwiseKernel(const KPDevice &ctx,
 #endif
   switch (vec_size) {
     case VecSizeL:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
+      LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
          ctx, ins, outs, read_lens, func);
       break;
     case VecSizeM:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
+      LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
          ctx, ins, outs, read_lens, func);
       break;
     case VecSizeS:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
+      LaunchElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
          ctx, ins, outs, read_lens, func);
       break;
     default: {
...
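
The main_offset value above is numel rounded down to a multiple of the per-block read width, so the grid-stride loop only ever issues full vector loads and the tail is handled by a single boundary call. A host-side sketch of that split (illustrative numbers, not Paddle code):

// Host-side sketch of the main_offset / remainder split.
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = 1000003;   // total element count
  int vec_size = 4;          // VecSize
  int block_size = 512;      // threads per block
  int64_t chunk = static_cast<int64_t>(vec_size) * block_size;  // 2048
  int64_t main_offset = (numel / chunk) * chunk;  // vectorized part: 999424
  int64_t remain = numel - main_offset;           // boundary part: 579
  std::cout << "main_offset = " << main_offset << ", remain = " << remain << '\n';
  return 0;
}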
@@ -87,3 +87,16 @@
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 #endif
+
+namespace phi {
+namespace kps {
+
+#ifdef PADDLE_WITH_XPU_KP
+// The type of index used in kernel
+using IndexType = int;
+#else
+using IndexType = int64_t;
+#endif
+
+}  // namespace kps
+}  // namespace phi