Commit a1dbee23 (unverified)
Repository: BaiXuePrincess / Paddle, forked from PaddlePaddle / Paddle
Author: sneaxiy
Committed via GitHub on Sep 05, 2022
Parent: ea50282b

fix some op int32 exceed range (#45711)

Showing 8 changed files with 58 additions and 44 deletions (+58 -44):
  paddle/fluid/platform/device/gpu/cuda/cuda_helper.h    +4  -3
  paddle/fluid/platform/device/gpu/gpu_launch_config.h   +15 -10
  paddle/fluid/platform/device/gpu/rocm/rocm_helper.h    +4  -3
  paddle/phi/backends/gpu/cuda/cuda_helper.h             +4  -3
  paddle/phi/backends/gpu/gpu_launch_config.h            +6  -5
  paddle/phi/backends/gpu/rocm/rocm_helper.h             +4  -3
  paddle/phi/kernels/gpu/one_hot_kernel.cu               +6  -5
  paddle/phi/kernels/gpu/stack_kernel.cu                 +15 -12
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h

@@ -71,7 +71,8 @@ namespace platform {
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
+  int64_t __index__ =                                              \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
   for (index_type i = __index__; __index__ < (num);                \
        __index__ += blockDim.x * gridDim.x, i = __index__)
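The point of the cast: blockIdx.x, blockDim.x, and threadIdx.x are all 32-bit unsigned values, so the old expression was evaluated in 32-bit arithmetic and wrapped modulo 2^32 before the assignment widened it to int64_t. A minimal sketch of the difference (the kernel and its names are illustrative, not part of the patch):

__global__ void index_width_demo(int64_t* out) {
  // Wraps: unsigned 32-bit multiply/add, widened to int64_t only afterwards.
  // On a launch where gridDim.x * blockDim.x exceeds 2^32, distinct threads
  // end up with the same "linear id".
  int64_t wrapped = blockIdx.x * blockDim.x + threadIdx.x;

  // Correct: widening the first operand promotes the whole expression to
  // 64-bit arithmetic, which is exactly what the patched macro does.
  int64_t widened =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  if (wrapped != widened && out != nullptr) {
    out[0] = widened;  // only reachable on launches past the 2^32 boundary
  }
}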
paddle/fluid/platform/device/gpu/gpu_launch_config.h

@@ -44,7 +44,10 @@
 namespace paddle {
 namespace platform {

-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}

 /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
    for round integer value into next highest power of 2. */
@@ -120,7 +123,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.
@@ -132,7 +135,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;
@@ -146,8 +149,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }

 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -162,8 +165,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
                         y_dim));

   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  // NOTE(zengjinle): cast std::min<int64_t> result to int is safe here, because
+  // kThreadsPerBlock is always very small.
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
+  int block_rows = std::max<int64_t>(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
   const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
@@ -172,9 +177,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
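The templated DivUp is what lets the block count above be computed as DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads) in 64-bit. A standalone sketch of the failure mode it closes (the main function and the sizes are illustrative, not from the patch):

#include <cstdint>
#include <iostream>

// Copied from the hunk above; the default argument keeps old call sites
// (DivUp(a, b) on two ints) compiling unchanged.
template <typename T = int>
inline T DivUp(T a, T b) {
  return (a + b - 1) / b;
}

int main() {
  int64_t numel = 3LL << 30;  // ~3.2e9 elements, larger than INT32_MAX
  int threads = 256;

  // Old behavior: numel truncated to int, giving a garbage block count
  // (e.g. -4194303 on a typical two's-complement target).
  int bad = DivUp<int>(static_cast<int>(numel), threads);

  // New behavior: instantiate with int64_t so the division happens in
  // 64-bit arithmetic (12582912 here).
  int64_t good = DivUp<int64_t>(numel, threads);

  std::cout << "bad: " << bad << "  good: " << good << "\n";
  return 0;
}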
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h

@@ -68,7 +68,8 @@ namespace platform {
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;       \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
   for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
paddle/phi/backends/gpu/cuda/cuda_helper.h

@@ -63,7 +63,8 @@ namespace gpu {
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
+  int64_t __index__ =                                              \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
   for (index_type i = __index__; __index__ < (num);                \
        __index__ += blockDim.x * gridDim.x, i = __index__)
paddle/phi/backends/gpu/gpu_launch_config.h

@@ -162,8 +162,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }

 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
@@ -178,7 +178,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
                         y_dim));

   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
@@ -188,8 +188,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
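Note why the explicit template argument is needed at all: once x_dim becomes int64_t, plain std::min(x_dim, kThreadsPerBlock) no longer compiles, since deduction sees two different argument types. Spelling out std::min<int64_t> converts the int operand up and keeps the comparison in 64-bit. A small standalone illustration (hypothetical values, not from the patch):

#include <algorithm>
#include <cstdint>

int main() {
  int64_t x_dim = 5'000'000'000;  // > INT32_MAX
  const int kThreadsPerBlock = 256;

  // std::min(x_dim, kThreadsPerBlock);  // ill-formed: T deduced as both
  //                                     // int64_t and int

  // With the template argument explicit, the comparison runs in 64-bit and
  // large x_dim values never truncate; the narrowing back to int is safe
  // because the result is bounded by kThreadsPerBlock.
  int block_cols =
      static_cast<int>(std::min<int64_t>(x_dim, kThreadsPerBlock));
  return block_cols == kThreadsPerBlock ? 0 : 1;
}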
paddle/phi/backends/gpu/rocm/rocm_helper.h

@@ -63,7 +63,8 @@ namespace gpu {
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;       \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
   for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
paddle/phi/kernels/gpu/one_hot_kernel.cu

@@ -16,6 +16,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -28,8 +29,7 @@ __global__ void FillOutputKernel(const InT* p_in_data,
                                  OutT* p_out_data,
                                  const int64_t numel,
                                  const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
+  CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t) {
     PADDLE_ENFORCE(p_in_data[idx] >= 0 && p_in_data[idx] < depth,
                    "Illegal index value, Input(input) value should be "
                    "greater than or equal to 0, and less than depth [%d], "
@@ -62,9 +62,10 @@ struct OneHotV2OpCUDAFunctor {
     auto stream = ctx_.stream();
     funcs::set_constant(ctx_, out_, 0.0);

-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS,
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel);
+    FillOutputKernel<<<config.block_per_grid,
+                       config.thread_per_block,
                        0,
                        stream>>>(p_in_data, p_out_data, numel, depth_);
   }
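The old guard "if (idx < numel)" assumed one thread per element, which breaks once numel no longer fits a 32-bit index or the requested grid exceeds the device limit. CUDA_KERNEL_LOOP_TYPE expands to a grid-stride loop instead; a minimal sketch of that pattern (hypothetical kernel, simplified from the macro):

__global__ void fill_ones(float* out, int64_t numel) {
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  // Each thread handles idx, idx + stride, idx + 2 * stride, ..., so the
  // kernel stays correct even when the grid is smaller than numel / blockDim.
  for (; idx < numel; idx += stride) {
    out[idx] = 1.0f;
  }
}

This is also why the launch site can delegate grid sizing to GetGpuLaunchConfig1D: an undersized (clamped) grid costs extra loop iterations, not correctness.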
paddle/phi/kernels/gpu/stack_kernel.cu

@@ -23,20 +23,23 @@ namespace phi {
 template <typename T, typename IntType>
 __global__ void StackCUDAKernel(T** input_ptrs,
-                                int split_size,
-                                int rows,
-                                int cols,
+                                IntType split_size,
+                                IntType rows,
+                                IntType cols,
                                 T* __restrict__ output) {
-  IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
+  IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;

-  for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
-    IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
+  for (; grid_x < cols; grid_x += grid_x_stride) {
+    IntType grid_y =
+        static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;

     IntType split = grid_x / split_size;
     const T* input_ptr = input_ptrs[split];
     IntType col_offset = grid_x % split_size;
 #pragma unroll
-    for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
+    for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
           input_ptr[grid_y * split_size + col_offset];
     }
@@ -69,12 +72,12 @@ void StackKernel(const Context& dev_ctx,
                              dev_ctx.stream());

   // Split x dim from axis to matrix
-  int x_row = 1, x_col = 1;
+  int64_t x_row = 1, x_col = 1;
   for (int i = 0; i < axis; ++i) {
     x_row *= x[0]->dims()[i];
   }
   x_col = x[0]->numel() / x_row;
-  int out_col = x_col * n;
+  int64_t out_col = x_col * n;
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
@@ -85,9 +88,9 @@ void StackKernel(const Context& dev_ctx,
         config.thread_per_block,
         0,
         dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                            x_col,
-                            x_row,
-                            out_col,
+                            static_cast<int32_t>(x_col),
+                            static_cast<int32_t>(x_row),
+                            static_cast<int32_t>(out_col),
                             y_data);
   } else {
     StackCUDAKernel<T, int64_t>
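The StackKernel change keeps a fast path: extents are now computed in int64_t, but when they still fit in 32 bits the kernel is instantiated with int32_t indices (cheaper integer arithmetic on GPUs), with explicit static_cast<int32_t> at the call site, and only otherwise falls back to StackCUDAKernel<T, int64_t>. A sketch of that dispatch pattern with a hypothetical element-wise kernel (names, thread counts, and the grid cap are illustrative, not from the patch):

#include <algorithm>
#include <cstdint>
#include <limits>

template <typename T, typename IndexT>
__global__ void copy_kernel(const T* in, T* out, IndexT n) {
  IndexT i = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
  IndexT stride = static_cast<IndexT>(blockDim.x) * gridDim.x;
  // Grid-stride loop: correct under any grid size chosen below.
  for (; i < n; i += stride) out[i] = in[i];
}

template <typename T>
void launch_copy(const T* in, T* out, int64_t n, cudaStream_t stream) {
  const int threads = 256;
  // Cap the grid; the grid-stride loop inside the kernel covers the rest.
  const int blocks =
      static_cast<int>(std::min<int64_t>((n + threads - 1) / threads, 65535));
  if (n <= std::numeric_limits<int32_t>::max()) {
    // Every index fits in 32 bits: take the cheaper instantiation.
    copy_kernel<T, int32_t>
        <<<blocks, threads, 0, stream>>>(in, out, static_cast<int32_t>(n));
  } else {
    copy_kernel<T, int64_t><<<blocks, threads, 0, stream>>>(in, out, n);
  }
}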