Unverified commit a1dbee23, authored by sneaxiy, committed by GitHub

fix some op int32 exceed range (#45711)

Parent ea50282b
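
The bug class being fixed: inside a CUDA kernel, `blockIdx.x * blockDim.x + threadIdx.x` is evaluated in 32-bit unsigned arithmetic, so for very large launches the product wraps around modulo 2^32 before it is ever widened to `int64_t`. A minimal sketch of the failure mode and the fix (standalone, illustrative kernel; not code from this commit):

```cpp
// Hypothetical element-wise kernel, shown only to illustrate the overflow
// that the patched CUDA_KERNEL_LOOP_TYPE macro below avoids.
__global__ void ScaleKernel(const float* x, float* y, int64_t n) {
  // BAD: the multiply happens in 32-bit unsigned arithmetic and wraps
  // modulo 2^32, even though the result is stored in an int64_t.
  // int64_t i = blockIdx.x * blockDim.x + threadIdx.x;

  // GOOD: widen one operand first so the whole expression is 64-bit.
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = 2.0f * x[i];
  }
}
```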
@@ -70,9 +70,10 @@ namespace platform {
  *
  */
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
-  for (index_type i = __index__; __index__ < (num);                \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                   \
+  int64_t __index__ =                                               \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;  \
+  for (index_type i = __index__; __index__ < (num);                 \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 class CublasHandleHolder {
......
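
For context, `CUDA_KERNEL_LOOP_TYPE` implements a grid-stride loop; a hedged usage sketch (the kernel name and body are illustrative and assume the macro defined above):

```cpp
// Hypothetical kernel built on CUDA_KERNEL_LOOP_TYPE: the hidden loop
// counter __index__ is int64_t, so iteration stays correct when n exceeds
// INT32_MAX, while the visible index i takes the requested index_type.
template <typename T>
__global__ void FillKernel(T* out, T value, int64_t n) {
  CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { out[i] = value; }
}
```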
@@ -44,7 +44,10 @@
 namespace paddle {
 namespace platform {
 
-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}
 
 /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
    for round integer value into next highest power of 2. */
@@ -120,7 +123,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.
@@ -132,7 +135,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;
@@ -146,8 +149,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -162,8 +165,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
           y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  // NOTE(zengjinle): cast std::min<int64_t> result to int is safe here, because
+  // kThreadsPerBlock is always very small.
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
+  int block_rows = std::max<int64_t>(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
   const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
@@ -172,9 +177,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
 
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
......
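
The templated `DivUp` is what lets the block count above be computed in 64-bit; the old `int`-only overload would silently truncate a `numel` that no longer fits in 32 bits. A hedged, host-only sketch of the arithmetic (values illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Same rounding-up division as the templated DivUp in the diff.
template <typename T = int>
inline T DivUp(T a, T b) {
  return (a + b - 1) / b;
}

int main() {
  // A tensor with 3 * 2^30 elements has numel ~= 3.2e9, which does not fit
  // in int32_t; instantiating DivUp with int64_t keeps every intermediate
  // value exact.
  int64_t numel = 3LL * (1LL << 30);
  int vec_size = 4;
  int threads = 256;
  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
  assert(blocks == 3145728);  // (3 * 2^30 / 4) / 256
  return 0;
}
```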
@@ -67,9 +67,10 @@ namespace platform {
  *
  */
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                       \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;   \
-  for (index_type i = __index__; __index__ < (num);                     \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
+  for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 class CublasHandleHolder {
......
@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
-  for (index_type i = __index__; __index__ < (num);                \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                   \
+  int64_t __index__ =                                               \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;  \
+  for (index_type i = __index__; __index__ < (num);                 \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 }  // namespace gpu
......
@@ -162,8 +162,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -178,7 +178,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
           y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
@@ -188,8 +188,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
-  int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
 
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
......
@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                       \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;   \
-  for (index_type i = __index__; __index__ < (num);                     \
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
+  for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 }  // namespace gpu
......
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -28,8 +29,7 @@ __global__ void FillOutputKernel(const InT* p_in_data,
                                  OutT* p_out_data,
                                  const int64_t numel,
                                  const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
+  CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t) {
     PADDLE_ENFORCE(p_in_data[idx] >= 0 && p_in_data[idx] < depth,
                    "Illegal index value, Input(input) value should be "
                    "greater than or equal to 0, and less than depth [%d], "
@@ -62,9 +62,10 @@ struct OneHotV2OpCUDAFunctor {
     auto stream = ctx_.stream();
     funcs::set_constant(ctx_, out_, 0.0);
 
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS,
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel);
+    FillOutputKernel<<<config.block_per_grid,
+                       config.thread_per_block,
                        0,
                        stream>>>(p_in_data, p_out_data, numel, depth_);
   }
......
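
Switching FillOutputKernel to `GetGpuLaunchConfig1D` matters beyond the launch math: the old kernel indexed with a 32-bit `int`, which overflows once `numel` passes 2^31, and its one-thread-per-element launch had no way to recover. The config helper computes and clamps the grid in 64-bit, and the grid-stride loop covers whatever remains. A hedged host-side sketch of the clamping idea (constants illustrative, not the helper's exact logic):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Sketch of the 1-D launch clamping idea: compute blocks in 64-bit, cap them
// at the device grid limit, and rely on the kernel's grid-stride loop
// (CUDA_KERNEL_LOOP_TYPE) to cover the remaining elements.
int64_t ClampedBlocks1D(int64_t numel, int threads, int64_t limit_blocks) {
  int64_t blocks = (numel + threads - 1) / threads;  // 64-bit DivUp
  return std::min(blocks, limit_blocks);
}

int main() {
  const int64_t numel = 5000000000LL;       // > INT32_MAX elements
  const int threads = 512;
  const int64_t limit_blocks = 2147483647;  // typical gridDim.x maximum
  std::cout << ClampedBlocks1D(numel, threads, limit_blocks) << "\n";  // 9765625
  return 0;
}
```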
@@ -23,20 +23,23 @@ namespace phi {
 
 template <typename T, typename IntType>
 __global__ void StackCUDAKernel(T** input_ptrs,
-                                int split_size,
-                                int rows,
-                                int cols,
+                                IntType split_size,
+                                IntType rows,
+                                IntType cols,
                                 T* __restrict__ output) {
-  IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
+  IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;
 
-  for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
-    IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
+  for (; grid_x < cols; grid_x += grid_x_stride) {
+    IntType grid_y =
+        static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;
 
     IntType split = grid_x / split_size;
     const T* input_ptr = input_ptrs[split];
     IntType col_offset = grid_x % split_size;
 #pragma unroll
-    for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
+    for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
           input_ptr[grid_y * split_size + col_offset];
     }
@@ -69,12 +72,12 @@ void StackKernel(const Context& dev_ctx,
                    dev_ctx.stream());
 
   // Split x dim from axis to matrix
-  int x_row = 1, x_col = 1;
+  int64_t x_row = 1, x_col = 1;
   for (int i = 0; i < axis; ++i) {
     x_row *= x[0]->dims()[i];
   }
   x_col = x[0]->numel() / x_row;
-  int out_col = x_col * n;
+  int64_t out_col = x_col * n;
 
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
@@ -85,9 +88,9 @@ void StackKernel(const Context& dev_ctx,
                            config.thread_per_block,
                            0,
                            dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                                               x_col,
-                                               x_row,
-                                               out_col,
+                                               static_cast<int32_t>(x_col),
+                                               static_cast<int32_t>(x_row),
+                                               static_cast<int32_t>(out_col),
                                                y_data);
   } else {
     StackCUDAKernel<T, int64_t>
......
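
The truncated tail of this hunk picks between the two `StackCUDAKernel` instantiations: 32-bit indices for the common case (cheaper on the GPU), 64-bit only when the flattened output is too large. A hedged, host-only sketch of that decision (the guard shown is an assumption, not copied from the diff):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Illustrative helper: returns true when every linear index into an
// out_col x x_row output matrix fits in int32_t, i.e. when the
// StackCUDAKernel<T, int32_t> instantiation can be used safely.
bool CanUseInt32Index(int64_t out_col, int64_t x_row) {
  const int64_t total = out_col * x_row;  // flattened element count
  return total < std::numeric_limits<int32_t>::max();
}

int main() {
  std::cout << CanUseInt32Index(1 << 20, 1 << 10) << "\n";  // 1 -> int32 path
  std::cout << CanUseInt32Index(1 << 20, 1 << 12) << "\n";  // 0 -> int64 path
  return 0;
}
```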