PaddlePaddle / Paddle
Commit a1dbee23 (unverified)
Authored by sneaxiy on Sep 05, 2022; committed via GitHub on Sep 05, 2022
fix some op int32 exceed range (#45711)
Parent: ea50282b

Showing 8 changed files with 58 additions and 44 deletions (+58 −44)
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h    +4  −3
paddle/fluid/platform/device/gpu/gpu_launch_config.h   +15 −10
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h    +4  −3
paddle/phi/backends/gpu/cuda/cuda_helper.h             +4  −3
paddle/phi/backends/gpu/gpu_launch_config.h            +6  −5
paddle/phi/backends/gpu/rocm/rocm_helper.h             +4  −3
paddle/phi/kernels/gpu/one_hot_kernel.cu               +6  −5
paddle/phi/kernels/gpu/stack_kernel.cu                 +15 −12
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h

@@ -70,9 +70,10 @@ namespace platform {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                    \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;         \
+  int64_t __index__ =                                                \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;   \
   for (index_type i = __index__; __index__ < (num);                  \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 class CublasHandleHolder {
...
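Why the cast matters: blockIdx.x, blockDim.x, and threadIdx.x are 32-bit unsigned values, so the old initializer was multiplied and added entirely in 32-bit arithmetic and wrapped around before the widening assignment to int64_t could help. Casting the first operand promotes the whole expression to 64 bits. A minimal host-side sketch of the failure mode (the variables are made-up stand-ins for the CUDA built-ins, not code from this patch):

#include <cstdint>
#include <cstdio>

int main() {
  // Stand-ins for blockIdx.x, blockDim.x, threadIdx.x (all unsigned int).
  unsigned int block_idx = 5'000'000u, block_dim = 1024u, thread_idx = 0u;

  // Old macro shape: the multiply wraps modulo 2^32 before the
  // (too late) widening assignment to int64_t.
  int64_t wrapped = block_idx * block_dim + thread_idx;

  // Patched shape: widen first, so the arithmetic happens in 64 bits.
  int64_t correct = static_cast<int64_t>(block_idx) * block_dim + thread_idx;

  std::printf("wrapped = %lld, correct = %lld\n",
              static_cast<long long>(wrapped),
              static_cast<long long>(correct));
  // Prints: wrapped = 825032704, correct = 5120000000
  return 0;
}

The three remaining copies of this macro below (the ROCm and phi backends) receive the identical fix.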
paddle/fluid/platform/device/gpu/gpu_launch_config.h

@@ -44,7 +44,10 @@
 namespace paddle {
 namespace platform {
 
-inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+template <typename T = int>
+inline T DivUp(T a, T b) {
+  return (a + b - 1) / b;
+}
 
 /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
    for round integer value into next highest power of 2. */
...
@@ -120,7 +123,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 #endif
   int threads = limit_threads;
   int sm_count = context.GetSMCount();
-  int active_threads_num = numel / vec_size;
+  int64_t active_threads_num = numel / vec_size;
   if (active_threads_num / (sm_count << 1) < limit_threads) {
     // Round up threads number into an exponential multiple of 2, while number
     // of acitve blocks is about twice of SM, to acquire better performance.
...
@@ -132,7 +135,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
   }
   // Number of threads per block shall be larger than 64.
   threads = std::max(64, threads);
-  int blocks = DivUp(DivUp(numel, vec_size), threads);
+  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);
   int limit_blocks = context.GetCUDAMaxGridDimSize()[0];
   if (blocks > limit_blocks) {
     blocks = limit_blocks;
...
@@ -146,8 +149,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
...
@@ -162,8 +165,10 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
           y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
-  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
+  // NOTE(zengjinle): cast std::min<int64_t> result to int is safe here, because
+  // kThreadsPerBlock is always very small.
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
+  int block_rows = std::max<int64_t>(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
   const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
...
@@ -172,9 +177,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
-  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y =
-      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
 
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
...
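Two things happen in this file: DivUp becomes a template with a defaulted T = int, so every existing DivUp(int, int) call keeps compiling unchanged while GetGpuLaunchConfig1D can request DivUp<int64_t>; and the intermediate block count becomes int64_t, so a numel above INT32_MAX is divided exactly before being clamped to the device grid limit. A self-contained sketch of that order of operations (the threads, vec_size, and grid-limit values are illustrative, not a real device query):

#include <cstdint>
#include <cstdio>

template <typename T = int>
inline T DivUp(T a, T b) {
  return (a + b - 1) / b;
}

int main() {
  int64_t numel = int64_t{3} << 31;  // ~6.4e9 elements; not representable in int
  int vec_size = 4;                  // illustrative vectorization width
  int threads = 512;                 // illustrative threads per block

  // 64-bit ceil-division keeps the intermediate block count exact ...
  int64_t blocks = DivUp<int64_t>(DivUp<int64_t>(numel, vec_size), threads);

  // ... and only afterwards is it clamped, the way GetGpuLaunchConfig1D
  // clamps against GetCUDAMaxGridDimSize()[0].
  int64_t limit_blocks = 2147483647;  // illustrative gridDim.x limit
  if (blocks > limit_blocks) blocks = limit_blocks;

  std::printf("blocks = %lld\n", static_cast<long long>(blocks));  // 3145728
  return 0;
}

If the grid does get clamped, the grid-stride loop in CUDA_KERNEL_LOOP_TYPE picks up the elements the truncated grid would otherwise miss.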
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h

@@ -67,9 +67,10 @@ namespace platform {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;       \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
   for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 class CublasHandleHolder {
...
paddle/phi/backends/gpu/cuda/cuda_helper.h

@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                    \
-  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;         \
+  int64_t __index__ =                                                \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;   \
   for (index_type i = __index__; __index__ < (num);                  \
        __index__ += blockDim.x * gridDim.x, i = __index__)
 
 }  // namespace gpu
...
paddle/phi/backends/gpu/gpu_launch_config.h

@@ -162,8 +162,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
 }
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
-                                            int x_dim,
-                                            int y_dim) {
+                                            int64_t x_dim,
+                                            int64_t y_dim) {
   PADDLE_ENFORCE_GT(
       x_dim,
       0,
...
@@ -178,7 +178,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
           y_dim));
 
   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
+  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
...
@@ -188,8 +188,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = std::min(DivUp<int>(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = std::min<int64_t>(DivUp<int64_t>(x_dim, block_cols), max_blocks);
+  int grid_y = std::min<int64_t>(max_blocks / grid_x,
+                                 std::max<int64_t>(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
...
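A side effect of widening x_dim and y_dim in this phi copy is that a bare std::min(x_dim, kThreadsPerBlock) would no longer compile: template argument deduction sees int64_t on one side and int on the other and fails, hence the explicit std::min<int64_t> / std::max<int64_t> spellings. Narrowing the result back to int remains safe because it is bounded by kThreadsPerBlock, which is the point of the NOTE(zengjinle) comment in the fluid copy above. A minimal sketch (values are made up):

#include <algorithm>
#include <cstdint>

int main() {
  int64_t x_dim = 5'000'000'000;  // dimension that no longer fits in int32
  const int kThreadsPerBlock = 256;

  // std::min(x_dim, kThreadsPerBlock);  // ill-formed: T deduces to both
  //                                     // int64_t and int
  int block_cols = std::min<int64_t>(x_dim, kThreadsPerBlock);

  // Narrowing to int is safe: the result is at most kThreadsPerBlock.
  return block_cols == kThreadsPerBlock ? 0 : 1;
}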
paddle/phi/backends/gpu/rocm/rocm_helper.h

@@ -62,9 +62,10 @@ namespace gpu {
  *
  */
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
-  int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;       \
+  int64_t __index__ =                                                       \
+      static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
   for (index_type i = __index__; __index__ < (num);                         \
        __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
 
 }  // namespace gpu
...
paddle/phi/kernels/gpu/one_hot_kernel.cu

@@ -16,6 +16,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -28,8 +29,7 @@ __global__ void FillOutputKernel(const InT* p_in_data,
                                  OutT* p_out_data,
                                  const int64_t numel,
                                  const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
+  CUDA_KERNEL_LOOP_TYPE(idx, numel, int64_t) {
     PADDLE_ENFORCE(p_in_data[idx] >= 0 && p_in_data[idx] < depth,
                    "Illegal index value, Input(input) value should be "
                    "greater than or equal to 0, and less than depth [%d], "
...
@@ -62,9 +62,10 @@ struct OneHotV2OpCUDAFunctor {
     auto stream = ctx_.stream();
     funcs::set_constant(ctx_, out_, 0.0);
 
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS,
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel);
+    FillOutputKernel<<<config.block_per_grid,
+                       config.thread_per_block,
                        0,
                        stream>>>(p_in_data, p_out_data, numel, depth_);
   }
...
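The old launch computed the grid as (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS and relied on exactly one thread per element, guarded by if (idx < numel); with a 32-bit idx and a huge numel that guard itself was unsafe. The replacement pairs GetGpuLaunchConfig1D, which may cap the grid, with the grid-stride loop from CUDA_KERNEL_LOOP_TYPE, so each thread covers idx, idx + stride, idx + 2*stride, and so on. An illustrative standalone kernel in the same shape (FillOnes is a made-up stand-in, not the patched FillOutputKernel):

#include <cstdint>

__global__ void FillOnes(float* out, int64_t numel) {
  // 64-bit global index, widened before the multiply as in the macro.
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;

  // Grid-stride loop: correct even when the grid was clamped below
  // DivUp(numel, blockDim.x), because each thread handles several elements.
  for (; idx < numel; idx += stride) {
    out[idx] = 1.0f;
  }
}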
paddle/phi/kernels/gpu/stack_kernel.cu

@@ -23,20 +23,23 @@ namespace phi {
 template <typename T, typename IntType>
 __global__ void StackCUDAKernel(T** input_ptrs,
-                                int split_size,
-                                int rows,
-                                int cols,
+                                IntType split_size,
+                                IntType rows,
+                                IntType cols,
                                 T* __restrict__ output) {
-  IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
+  IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;
 
-  for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
-    IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
+  for (; grid_x < cols; grid_x += grid_x_stride) {
+    IntType grid_y =
+        static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;
 
     IntType split = grid_x / split_size;
     const T* input_ptr = input_ptrs[split];
     IntType col_offset = grid_x % split_size;
 #pragma unroll
-    for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
+    for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
           input_ptr[grid_y * split_size + col_offset];
     }
...
@@ -69,12 +72,12 @@ void StackKernel(const Context& dev_ctx,
                        dev_ctx.stream());
 
   // Split x dim from axis to matrix
-  int x_row = 1, x_col = 1;
+  int64_t x_row = 1, x_col = 1;
   for (int i = 0; i < axis; ++i) {
     x_row *= x[0]->dims()[i];
   }
   x_col = x[0]->numel() / x_row;
-  int out_col = x_col * n;
+  int64_t out_col = x_col * n;
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
...
@@ -85,9 +88,9 @@ void StackKernel(const Context& dev_ctx,
             config.thread_per_block,
             0,
             dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                                x_col,
-                                x_row,
-                                out_col,
+                                static_cast<int32_t>(x_col),
+                                static_cast<int32_t>(x_row),
+                                static_cast<int32_t>(out_col),
                                 y_data);
   } else {
     StackCUDAKernel<T, int64_t>
...
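The last hunk shows the dispatch shape rather than the whole story: the visible branch narrows x_col, x_row, and out_col to int32_t for one instantiation, and StackCUDAKernel<T, int64_t> sits on the else branch. The branch condition lies outside the captured hunks; presumably it tests whether the output extent fits 32-bit indexing, since 64-bit multiply and modulo in the inner loop are generally slower on GPUs. A sketch of that two-instantiation pattern under that assumption (LaunchStack and Dispatch are hypothetical stand-ins, not Paddle functions):

#include <cstdint>
#include <cstdio>
#include <limits>

// Hypothetical stand-in for launching StackCUDAKernel<T, IntType>.
template <typename IntType>
void LaunchStack(IntType split_size, IntType rows, IntType cols) {
  std::printf("index width: %zu bytes\n", sizeof(IntType));
}

void Dispatch(int64_t x_col, int64_t x_row, int64_t out_col) {
  // Assumed condition: total output extent decides the index width.
  if (out_col * x_row <= std::numeric_limits<int32_t>::max()) {
    // Common path: narrow explicitly, as the patched call site does.
    LaunchStack<int32_t>(static_cast<int32_t>(x_col),
                         static_cast<int32_t>(x_row),
                         static_cast<int32_t>(out_col));
  } else {
    LaunchStack<int64_t>(x_col, x_row, out_col);
  }
}

int main() {
  Dispatch(1000, 1000, 4000);                  // takes the 32-bit path
  Dispatch(1000, 3'000'000, 4'000'000'000LL);  // takes the 64-bit path
  return 0;
}

Keeping a 32-bit instantiation on the common path is the usual reason for this pattern: it preserves the old performance for tensors that fit, while the 64-bit fallback makes oversized tensors correct instead of silently wrapping.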