Commit b85c8e03 (unverified)
Repository: Crayon鑫/Paddle, a fork of PaddlePaddle/Paddle
Authored on Mar 09, 2021 by Qi Li; committed via GitHub on Mar 09, 2021.
Parent commit: 39a5424e

[ROCM] fix reduce op, test=develop (#31478)
Showing 5 changed files with 41 additions and 7 deletions (+41, -7):

    paddle/fluid/framework/details/nan_inf_utils_detail.cu   (+8, -3)
    paddle/fluid/imperative/reducer.cc                        (+1, -1)
    paddle/fluid/operators/reduce_ops/reduce_prod_op.cu       (+12, -0)
    paddle/fluid/platform/gpu_launch_config.h                 (+1, -0)
    python/paddle/fluid/tests/unittests/test_reduce_op.py     (+19, -3)
paddle/fluid/framework/details/nan_inf_utils_detail.cu

@@ -82,7 +82,7 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
     }
   }
   __syncthreads;
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
   if (true && hipThreadIdx_x == 0) {
     printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
            nan_count, inf_count, num_count);

@@ -156,7 +156,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
               "op_var2gpu_str, but now failed",
               op_var));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
       PADDLE_ENFORCE_CUDA_SUCCESS(
           hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                          hipMemcpyHostToDevice, dev_ctx->stream()));

@@ -176,11 +176,16 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
     }
   }
+#ifdef __HIPCC__
+  // HIP will throw GPU memory access fault if threads > 256
+  const size_t threads = 256;
+#else
   const size_t threads = 1024;
+#endif
   size_t blocks =
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
-#ifdef PADDLE_WITH_HIP
+#ifdef __HIPCC__
   hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
                      dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
                      print_num, gpu_str_ptr);
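For readers outside the Paddle tree, the launch sizing above reduces to a
compile-time cap on the block size plus a clamped ceil-divide for the grid.
Below is a minimal host-only sketch of that rule; kMaxThreads and numel are
illustrative names, not Paddle identifiers.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    #ifdef __HIPCC__
    // Mirrors the diff: HIP builds cap the block size at 256 threads.
    static const std::size_t kMaxThreads = 256;
    #else
    static const std::size_t kMaxThreads = 1024;
    #endif

    int main() {
      const std::size_t numel = 1 << 20;  // hypothetical element count
      const std::size_t threads = kMaxThreads;
      // Ceil-divide the element count by the block size, capped at 128
      // blocks, exactly as the nan/inf checker sizes its grid above.
      const std::size_t blocks = std::min(static_cast<std::size_t>(128),
                                          (numel + threads - 1) / threads);
      std::printf("launch: %zu block(s) x %zu thread(s)\n", blocks, threads);
      return 0;
    }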
paddle/fluid/imperative/reducer.cc

@@ -675,7 +675,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
         cv_.notify_all();
       }
     });
-#elif defined(PADDLE_WITH_NCCL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
   FusedAllReduceSchedule(run_order, group);
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
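The reducer change itself is a single guard: ROCm's RCCL, which mirrors
NCCL's collective API, now takes the same fused all-reduce path as NCCL.
A minimal sketch of that guard style follows; the function body and the
PADDLE_WITH_SOME_OTHER_BACKEND macro are hypothetical stand-ins for the
code elided above.

    #include <cstdio>

    // Hypothetical stand-in for the fused all-reduce scheduling path.
    void FusedAllReduceSchedule() { std::printf("fused all-reduce path\n"); }

    void MarkGroupReady() {
    #if defined(PADDLE_WITH_SOME_OTHER_BACKEND)
      // other-backend path (elided in the diff above)
    #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
      // RCCL exposes the same API surface as NCCL, so one scheduling path
      // serves both backends; only the guard needs to name both macros.
      FusedAllReduceSchedule();
    #else
      std::printf("not compiled with RCCL or NCCL\n");
    #endif
    }

    int main() {
      MarkGroupReady();
      return 0;
    }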
paddle/fluid/operators/reduce_ops/reduce_prod_op.cu

@@ -14,6 +14,17 @@
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 
+#ifdef __HIPCC__
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// do not support double in HIPCC platform (Eigen3 to be fixed)
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+#else
 REGISTER_OP_CUDA_KERNEL(reduce_prod,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::ProdFunctor>,

@@ -23,3 +34,4 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int, ops::ProdFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::ProdFunctor>);
+#endif
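The registration split keeps the float, int, and int64_t kernels on both
platforms and withholds only double from HIP builds, where Eigen's GPU
tensor reduction cannot yet handle it. The same shape, reduced to a
standalone sketch with generic names (not Paddle's REGISTER_OP_CUDA_KERNEL
machinery):

    #include <cstdio>
    #include <typeinfo>

    // Generic stand-in for registering one kernel instantiation per type.
    template <typename T>
    void RegisterProdKernel() {
      std::printf("registered reduce_prod for %s\n", typeid(T).name());
    }

    void RegisterAll() {
      RegisterProdKernel<float>();
      RegisterProdKernel<int>();
      RegisterProdKernel<long long>();
    #ifndef __HIPCC__
      // double stays CUDA-only until Eigen's GPU reduction supports it on
      // HIP, per the comment carried in the diff above.
      RegisterProdKernel<double>();
    #endif
    }

    int main() {
      RegisterAll();
      return 0;
    }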
paddle/fluid/platform/gpu_launch_config.h

@@ -42,6 +42,7 @@ struct GpuLaunchConfig {
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int element_count,
 #ifdef PADDLE_WITH_HIP
+    // HIP will throw GPU memory access fault if threads > 256
     int max_threads = 256) {
 #else
     int max_threads = 1024) {
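The header change only adds the explanatory comment; the surrounding
pattern is a default argument selected by the preprocessor. A standalone
sketch of that pattern under assumed names follows (LaunchConfig1D is a
hypothetical mirror, not Paddle's GpuLaunchConfig):

    #include <cstdio>

    struct LaunchConfig1D {
      int threads;
      int blocks;
    };

    inline LaunchConfig1D GetLaunchConfig1D(int element_count,
    #ifdef PADDLE_WITH_HIP
                                            // HIP faults above 256 threads
                                            int max_threads = 256) {
    #else
                                            int max_threads = 1024) {
    #endif
      LaunchConfig1D config;
      config.threads = max_threads;
      config.blocks = (element_count + max_threads - 1) / max_threads;
      return config;
    }

    int main() {
      const LaunchConfig1D config = GetLaunchConfig1D(1 << 20);
      std::printf("%d block(s) x %d thread(s)\n", config.blocks,
                  config.threads);
      return 0;
    }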
python/paddle/fluid/tests/unittests/test_reduce_op.py

@@ -156,9 +156,14 @@ class TestMin8DOp(OpTest):
 class TestProdOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.init_data_type()
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.data_type)}
         self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()

@@ -169,14 +174,19 @@ class TestProdOp(OpTest):
 class TestProd6DOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
+        self.init_data_type()
         self.inputs = {
-            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.data_type)
         }
         self.attrs = {'dim': [2, 3, 4]}
         self.outputs = {
             'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
         }
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()

@@ -187,14 +197,20 @@ class TestProd6DOp(OpTest):
 class TestProd8DOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
+        self.init_data_type()
         self.inputs = {
-            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+            'X': np.random.random(
+                (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.data_type)
         }
         self.attrs = {'dim': [2, 3, 4]}
         self.outputs = {
             'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
         }
 
+    def init_data_type(self):
+        self.data_type = "float32" if core.is_compiled_with_rocm(
+        ) else "float64"
+
     def test_check_output(self):
         self.check_output()
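The test updates mirror the kernel registration: HIP builds register no
double reduce_prod kernel, so ROCm runs exercise float32 inputs instead.
The same platform-conditional type choice, sketched in C++ with
hypothetical names:

    #include <cstdio>

    // Widest element type reduce_prod registers on the current platform;
    // double is unavailable under HIP (see reduce_prod_op.cu above).
    #ifdef __HIPCC__
    using TestDataType = float;
    #else
    using TestDataType = double;
    #endif

    int main() {
      std::printf("testing reduce_prod with %zu-byte elements\n",
                  sizeof(TestDataType));
      return 0;
    }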