Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
d7679426
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d7679426
编写于
8月 31, 2023
作者:
R
ronnywang
提交者:
GitHub
8月 31, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] Remove the constraint with a maximum number of threads per block of 256, P1 (#56699)
上级
68f8176b
变更
9
显示空白变更内容
内联
并排
Showing
9 changed file
with
2 addition
and
67 deletion
+2
-67
paddle/fluid/operators/graph_khop_sampler_op.cu
paddle/fluid/operators/graph_khop_sampler_op.cu
+1
-9
paddle/fluid/operators/math/unpooling.cu
paddle/fluid/operators/math/unpooling.cu
+0
-16
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
...e/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+0
-8
paddle/fluid/platform/device/gpu/gpu_launch_config.h
paddle/fluid/platform/device/gpu/gpu_launch_config.h
+0
-9
paddle/phi/backends/gpu/gpu_launch_config.h
paddle/phi/backends/gpu/gpu_launch_config.h
+0
-9
paddle/phi/kernels/funcs/cross_entropy.cu
paddle/phi/kernels/funcs/cross_entropy.cu
+0
-4
paddle/phi/kernels/funcs/elementwise_grad_base.h
paddle/phi/kernels/funcs/elementwise_grad_base.h
+0
-4
paddle/phi/kernels/funcs/for_range.h
paddle/phi/kernels/funcs/for_range.h
+1
-4
paddle/phi/kernels/gpu/lars_momentum_kernel.cu
paddle/phi/kernels/gpu/lars_momentum_kernel.cu
+0
-4
未找到文件。
paddle/fluid/operators/graph_khop_sampler_op.cu
浏览文件 @
d7679426
...
@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
...
@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
thrust
::
device_vector
<
T
>*
keys
,
thrust
::
device_vector
<
T
>*
keys
,
thrust
::
device_vector
<
T
>*
values
,
thrust
::
device_vector
<
T
>*
values
,
thrust
::
device_vector
<
int64_t
>*
key_index
)
{
thrust
::
device_vector
<
int64_t
>*
key_index
)
{
#ifdef PADDLE_WITH_HIP
int
block
=
256
;
#else
int
block
=
1024
;
int
block
=
1024
;
#endif
const
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
const
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
int
max_grid_dimx
=
dev_ctx
.
GetCUDAMaxGridDimSize
()[
0
];
int
max_grid_dimx
=
dev_ctx
.
GetCUDAMaxGridDimSize
()[
0
];
int
grid_tmp
=
(
num_input
+
block
-
1
)
/
block
;
int
grid_tmp
=
(
num_input
+
block
-
1
)
/
block
;
...
@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
...
@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
subset
->
resize
(
unique_items
.
size
());
subset
->
resize
(
unique_items
.
size
());
thrust
::
copy
(
unique_items
.
begin
(),
unique_items
.
end
(),
subset
->
begin
());
thrust
::
copy
(
unique_items
.
begin
(),
unique_items
.
end
(),
subset
->
begin
());
// Fill outputs with reindex result.
// Fill outputs with reindex result.
#ifdef PADDLE_WITH_HIP
int
block
=
256
;
#else
int
block
=
1024
;
int
block
=
1024
;
#endif
const
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
const
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
int64_t
max_grid_dimx
=
dev_ctx
.
GetCUDAMaxGridDimSize
()[
0
];
int64_t
max_grid_dimx
=
dev_ctx
.
GetCUDAMaxGridDimSize
()[
0
];
int64_t
grid_tmp
=
(
outputs
->
size
()
+
block
-
1
)
/
block
;
int64_t
grid_tmp
=
(
outputs
->
size
()
+
block
-
1
)
/
block
;
...
...
paddle/fluid/operators/math/unpooling.cu
浏览文件 @
d7679426
...
@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor<phi::GPUContext, T> {
...
@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor<phi::GPUContext, T> {
const
T
*
input_data
=
input
.
data
<
T
>
();
const
T
*
input_data
=
input
.
data
<
T
>
();
const
int
*
indices_data
=
indices
.
data
<
int
>
();
const
int
*
indices_data
=
indices
.
data
<
int
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
*
output_data
=
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
#ifdef __HIPCC__
int
threads
=
256
;
#else
int
threads
=
1024
;
int
threads
=
1024
;
#endif
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
KernelUnpool2dMax
<
T
>
KernelUnpool2dMax
<
T
>
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
...
@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor<phi::GPUContext, T> {
...
@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor<phi::GPUContext, T> {
const
T
*
output_data
=
output
.
data
<
T
>
();
const
T
*
output_data
=
output
.
data
<
T
>
();
const
T
*
output_grad_data
=
output_grad
.
data
<
T
>
();
const
T
*
output_grad_data
=
output_grad
.
data
<
T
>
();
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
#ifdef __HIPCC__
int
threads
=
256
;
#else
int
threads
=
1024
;
int
threads
=
1024
;
#endif
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
KernelUnpool2dMaxGrad
<
T
>
KernelUnpool2dMaxGrad
<
T
>
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
...
@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor<phi::GPUContext, T> {
...
@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor<phi::GPUContext, T> {
const
T
*
input_data
=
input
.
data
<
T
>
();
const
T
*
input_data
=
input
.
data
<
T
>
();
const
int
*
indices_data
=
indices
.
data
<
int
>
();
const
int
*
indices_data
=
indices
.
data
<
int
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
*
output_data
=
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
#ifdef __HIPCC__
int
threads
=
256
;
#else
int
threads
=
1024
;
int
threads
=
1024
;
#endif
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
KernelUnpool3dMax
<
T
>
KernelUnpool3dMax
<
T
>
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
...
@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor<phi::GPUContext, T> {
...
@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor<phi::GPUContext, T> {
const
T
*
output_data
=
output
.
data
<
T
>
();
const
T
*
output_data
=
output
.
data
<
T
>
();
const
T
*
output_grad_data
=
output_grad
.
data
<
T
>
();
const
T
*
output_grad_data
=
output_grad
.
data
<
T
>
();
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
#ifdef __HIPCC__
int
threads
=
256
;
#else
int
threads
=
1024
;
int
threads
=
1024
;
#endif
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
int
grid
=
(
input
.
numel
()
+
threads
-
1
)
/
threads
;
KernelUnpool3dMaxGrad
<
T
>
KernelUnpool3dMaxGrad
<
T
>
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
input
.
numel
(),
...
...
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
浏览文件 @
d7679426
...
@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,
...
@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,
constexpr
int
kNumTensor
=
MaxTensorNumPerLaunch
;
constexpr
int
kNumTensor
=
MaxTensorNumPerLaunch
;
constexpr
int
kNumChunk
=
MaxChunkNumPerLaunch
;
constexpr
int
kNumChunk
=
MaxChunkNumPerLaunch
;
#ifdef PADDLE_WITH_HIP
constexpr
int
kBlockDim
=
256
;
#else
constexpr
int
kBlockDim
=
512
;
constexpr
int
kBlockDim
=
512
;
#endif
int
max_chunk_num
=
-
1
;
int
max_chunk_num
=
-
1
;
int
vec_size
=
8
;
int
vec_size
=
8
;
...
@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
...
@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
phi
::
errors
::
InvalidArgument
(
"Beta2Pow should be nullptr."
));
phi
::
errors
::
InvalidArgument
(
"Beta2Pow should be nullptr."
));
}
}
#ifdef PADDLE_WITH_HIP
const
int
block_dim
=
256
;
#else
const
int
block_dim
=
512
;
const
int
block_dim
=
512
;
#endif
int
vec_size
=
8
;
int
vec_size
=
8
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
...
...
paddle/fluid/platform/device/gpu/gpu_launch_config.h
浏览文件 @
d7679426
...
@@ -32,14 +32,9 @@
...
@@ -32,14 +32,9 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better as thread_per_block
/* CUDA performs better as thread_per_block
num is between [64, 512] */
num is between [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
...
@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
n
|=
(
n
>>
4
);
n
|=
(
n
>>
4
);
n
|=
(
n
>>
8
);
n
|=
(
n
>>
8
);
n
|=
(
n
>>
16
);
n
|=
(
n
>>
16
);
#ifdef __HIPCC__
return
std
::
min
(
256
,
std
::
max
(
32
,
(
n
+
1
)));
#else
return
std
::
min
(
1024
,
std
::
max
(
32
,
(
n
+
1
)));
return
std
::
min
(
1024
,
std
::
max
(
32
,
(
n
+
1
)));
#endif
}
}
#ifdef WITH_NV_JETSON
#ifdef WITH_NV_JETSON
...
...
paddle/phi/backends/gpu/gpu_launch_config.h
浏览文件 @
d7679426
...
@@ -34,13 +34,8 @@
...
@@ -34,13 +34,8 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/enforce.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
// CUDA performs better when thread_per_block is between [64, 512]
// CUDA performs better when thread_per_block is between [64, 512]
#define PREDEFINED_BLOCK_SIZE 512
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace
phi
{
namespace
phi
{
namespace
backends
{
namespace
backends
{
...
@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
...
@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
inline
int64_t
RoundToPowerOfTwo
(
int64_t
n
)
{
inline
int64_t
RoundToPowerOfTwo
(
int64_t
n
)
{
constexpr
int64_t
min_val
=
32
;
constexpr
int64_t
min_val
=
32
;
int64_t
num
=
RoundToNextHighPowOfTwo
(
n
,
min_val
);
int64_t
num
=
RoundToNextHighPowOfTwo
(
n
,
min_val
);
#ifdef __HIPCC__
int64_t
max_val
=
256
;
#else
int64_t
max_val
=
1024
;
int64_t
max_val
=
1024
;
#endif
return
std
::
min
(
max_val
,
num
);
return
std
::
min
(
max_val
,
num
);
}
}
...
...
paddle/phi/kernels/funcs/cross_entropy.cu
浏览文件 @
d7679426
...
@@ -124,11 +124,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
...
@@ -124,11 +124,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
int
batch_size
=
prob
->
dims
()[
0
];
int
batch_size
=
prob
->
dims
()[
0
];
int
class_num
=
prob
->
dims
()[
1
];
int
class_num
=
prob
->
dims
()[
1
];
#ifdef __HIPCC__
constexpr
int
kMaxBlockDim
=
256
;
#else
constexpr
int
kMaxBlockDim
=
512
;
constexpr
int
kMaxBlockDim
=
512
;
#endif
if
(
softLabel
)
{
if
(
softLabel
)
{
const
T
*
label_data
=
labels
->
data
<
T
>
();
const
T
*
label_data
=
labels
->
data
<
T
>
();
...
...
paddle/phi/kernels/funcs/elementwise_grad_base.h
浏览文件 @
d7679426
...
@@ -32,11 +32,7 @@ limitations under the License. */
...
@@ -32,11 +32,7 @@ limitations under the License. */
#endif
#endif
#ifdef __HIPCC__
constexpr
int
ELEMWISE_MAX_BLOCK_DIM
=
256
;
#else
constexpr
int
ELEMWISE_MAX_BLOCK_DIM
=
1024
;
constexpr
int
ELEMWISE_MAX_BLOCK_DIM
=
1024
;
#endif
#define BLOCK_X 32
#define BLOCK_X 32
#define BLOCK_Y 32
#define BLOCK_Y 32
...
...
paddle/phi/kernels/funcs/for_range.h
浏览文件 @
d7679426
...
@@ -65,10 +65,7 @@ struct ForRange<phi::GPUContext> {
...
@@ -65,10 +65,7 @@ struct ForRange<phi::GPUContext> {
template
<
typename
Function
>
template
<
typename
Function
>
inline
void
operator
()(
Function
func
)
const
{
inline
void
operator
()(
Function
func
)
const
{
#ifdef __HIPCC__
#if WITH_NV_JETSON
// HIP will throw core dump when threads > 256
constexpr
int
num_threads
=
256
;
#elif WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
// JETSON_NANO will throw core dump when threads > 128
int
num_thread
=
256
;
int
num_thread
=
256
;
backends
::
gpu
::
ChangeThreadNum
(
dev_ctx_
,
&
num_thread
,
128
);
backends
::
gpu
::
ChangeThreadNum
(
dev_ctx_
,
&
num_thread
,
128
);
...
...
paddle/phi/kernels/gpu/lars_momentum_kernel.cu
浏览文件 @
d7679426
...
@@ -25,11 +25,7 @@
...
@@ -25,11 +25,7 @@
#include <cooperative_groups.h>
#include <cooperative_groups.h>
#endif
#endif
#ifdef __HIPCC__
#define LARS_BLOCK_SIZE 256
#else
#define LARS_BLOCK_SIZE 512
#define LARS_BLOCK_SIZE 512
#endif
#define LARS_MAX_MERGED_OPS 60
#define LARS_MAX_MERGED_OPS 60
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录