Unverified commit a1dbee23, authored by: S sneaxiy, committed by: GitHub

fix some op int32 exceed range (#45711)
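The title refers to 32-bit index arithmetic wrapping once a tensor has more than roughly 2^31 elements. A minimal, illustrative sketch of the failure mode and of the widening pattern applied throughout this diff (the kernel names below are hypothetical, not from the patch):

```cpp
#include <cstdint>

// Hypothetical kernels, for illustration only.
// Bad: blockIdx.x and blockDim.x are 32-bit, so their product is computed in
// 32-bit arithmetic and wraps for launches covering more than 2^31 elements.
__global__ void ScaleBad(float* data, int64_t n) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;  // wraps before widening
  if (idx < n) data[idx] *= 2.0f;
}

// Fixed: widen the first operand, so the whole expression is evaluated in 64-bit.
__global__ void ScaleOk(float* data, int64_t n) {
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < n) data[idx] *= 2.0f;
}
```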

Parent ea50282b
@@ -70,9 +70,10 @@ namespace platform {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
+  int64_t __index__ =                                               \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;  \
   for (index_type i = __index__; __index__ < (num);                 \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 class CublasHandleHolder {
...
@@ -44,7 +44,10 @@
 namespace paddle {
 namespace platform {
 
-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}
 
 /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
    for round integer value into next highest power of 2. */
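Making DivUp a template (defaulting to int, so existing call sites keep their behaviour) lets the launch-config code below perform the rounding-up division in 64-bit. A small usage sketch, assuming an element count that exceeds INT_MAX:

```cpp
// Sketch: dividing with T = int64_t avoids overflow of (a + b - 1) and of the
// result itself when numel is larger than 2^31 - 1.
int64_t numel = int64_t{1} << 33;                 // hypothetical element count
int threads = 256;
int64_t blocks = DivUp<int64_t>(numel, threads);  // threads promotes to int64_t
```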
@@ -120,7 +123,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.
@@ -132,7 +135,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;
@@ -146,8 +149,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -162,8 +165,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
           y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  // NOTE(zengjinle): cast std::min<int64_t> result to int is safe here, because
+  // kThreadsPerBlock is always very small.
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
+  int block_rows = std::max<int64_t>(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
   const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
@@ -172,9 +177,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
...
@@ -67,9 +67,10 @@ namespace platform {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                            \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;        \
+  int64_t __index__ =                                                         \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x;   \
   for (index_type i = __index__; __index__ < (num);                           \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 class CublasHandleHolder {
...
@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
+  int64_t __index__ =                                               \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;  \
   for (index_type i = __index__; __index__ < (num);                 \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 }  // namespace gpu
...
@@ -162,8 +162,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -178,7 +178,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
           y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
@@ -188,8 +188,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
...
@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                            \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;        \
+  int64_t __index__ =                                                         \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x;   \
   for (index_type i = __index__; __index__ < (num);                           \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 }  // namespace gpu
...
@@ -16,6 +16,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -28,8 +29,7 @@ __global__ void FillOutputKernel(const InT* p_in_data,
                                  OutT* p_out_data,
                                  const int64_t numel,
                                  const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
+  CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t) {
     PADDLE_ENFORCE(p_in_data[idx] >= 0 && p_in_data[idx] < depth,
                    "Illegal index value, Input(input) value should be "
                    "greater than or equal to 0, and less than depth [%d], "
@@ -62,9 +62,10 @@ struct OneHotV2OpCUDAFunctor {
     auto stream = ctx_.stream();
     funcs::set_constant(ctx_, out_, 0.0);
 
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS,
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel);
+
+    FillOutputKernel<<<config.block_per_grid,
+                       config.thread_per_block,
                        0,
                        stream>>>(p_in_data, p_out_data, numel, depth_);
   }
...
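Two things change together here: the launch now comes from GetGpuLaunchConfig1D, which may clamp the grid to the device limit, and the kernel body switches to CUDA_KERNEL_LOOP_TYPE, a grid-stride loop with 64-bit indexing, so every element is still visited even when there are fewer threads than elements. Roughly what the macro expands to inside FillOutputKernel (an approximation based on the macro shown earlier in this diff):

```cpp
// Approximate expansion of CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t):
// a grid-stride loop whose index arithmetic is carried out in int64_t.
int64_t __index__ = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
for (int64_t idx = __index__; __index__ < numel;
     __index__ += blockDim.x * gridDim.x, idx = __index__) {
  // ... validate p_in_data[idx] and write the corresponding one-hot entry ...
}
```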
@@ -23,20 +23,23 @@ namespace phi {
 template <typename T, typename IntType>
 __global__ void StackCUDAKernel(T** input_ptrs,
-                                int split_size,
-                                int rows,
-                                int cols,
+                                IntType split_size,
+                                IntType rows,
+                                IntType cols,
                                 T* __restrict__ output) {
-  IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
+  IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;
 
-  for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
-    IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
+  for (; grid_x < cols; grid_x += grid_x_stride) {
+    IntType grid_y =
+        static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;
 
     IntType split = grid_x / split_size;
     const T* input_ptr = input_ptrs[split];
     IntType col_offset = grid_x % split_size;
 #pragma unroll
-    for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
+    for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
           input_ptr[grid_y * split_size + col_offset];
     }
@@ -69,12 +72,12 @@ void StackKernel(const Context& dev_ctx,
                        dev_ctx.stream());
 
   // Split x dim from axis to matrix
-  int x_row = 1, x_col = 1;
+  int64_t x_row = 1, x_col = 1;
   for (int i = 0; i < axis; ++i) {
     x_row *= x[0]->dims()[i];
   }
   x_col = x[0]->numel() / x_row;
-  int out_col = x_col * n;
+  int64_t out_col = x_col * n;
 
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
@@ -85,9 +88,9 @@ void StackKernel(const Context& dev_ctx,
                            config.thread_per_block,
                            0,
                            dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                                               x_col,
-                                               x_row,
-                                               out_col,
+                                               static_cast<int32_t>(x_col),
+                                               static_cast<int32_t>(x_row),
+                                               static_cast<int32_t>(out_col),
                                                y_data);
   } else {
     StackCUDAKernel<T, int64_t>
...
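The casts above only appear in the 32-bit branch; the surrounding dispatch (whose exact condition is not visible in this hunk) keeps the cheaper int32_t instantiation for small outputs and falls back to full 64-bit indexing otherwise. A hedged sketch of that pattern, assuming <limits> is available and that the check is on the total output size:

```cpp
// Sketch of the dispatch implied by this hunk; the condition is assumed,
// not shown in the diff. out_col * x_row is the total number of output elements.
if (out_col * x_row <= std::numeric_limits<int32_t>::max()) {
  // All extents fit in int32, so the kernel runs with cheaper 32-bit indexing.
  StackCUDAKernel<T, int32_t>
      <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
          reinterpret_cast<T**>(tmp_x_data->ptr()),
          static_cast<int32_t>(x_col),
          static_cast<int32_t>(x_row),
          static_cast<int32_t>(out_col),
          y_data);
} else {
  // Large outputs keep int64_t indices end to end.
  StackCUDAKernel<T, int64_t>
      <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
          reinterpret_cast<T**>(tmp_x_data->ptr()), x_col, x_row, out_col, y_data);
}
```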