BaiXuePrincess / Paddle
Forked from PaddlePaddle / Paddle (in sync with the upstream project)
Commit 393a0b16 (unverified)
[NPU] refine nan check (#34508)
Authored on Aug 02, 2021 by Leo Chen; committed via GitHub on Aug 02, 2021.
Parent: a6f55e48
Showing 1 changed file with 23 additions and 25 deletions.

paddle/fluid/operators/collective/c_allreduce_op.h (+23 -25)
@@ -123,32 +123,30 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_ASCEND_CL)
 // return true if found_inf_or_nan or return false;
 template <typename T>
-bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
-                   aclrtStream stream, const paddle::framework::Tensor* in) {
+bool ContainsNan(const framework::ExecutionContext& exe_ctx, aclrtStream stream,
+                 const paddle::framework::Tensor* in) {
   auto& dev_ctx =
       exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
   using Tensor = paddle::framework::Tensor;
-  Tensor out(in->type());
-  out.Resize(in->dims());
-  out.mutable_data<T>(dev_ctx.GetPlace());
-  bool found_inf_data = false;
-  try {
-    const auto& runner =
-        NpuOpRunner("CheckNumerics", {*in}, {out},
-                    {{"message", std::string("check_numberics")}});
-    runner.Run(stream);
-    dev_ctx.Wait();
-  } catch (platform::EnforceNotMet& exception) {
-    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
-    found_inf_data = true;
-  } catch (...) {
-    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
-    found_inf_data = true;
+  Tensor mean(in->type());
+  mean.Resize({1});
+  mean.mutable_data<T>(dev_ctx.GetPlace());
+  std::vector<int> axes;
+  for (int i = 0; i < in->dims().size(); ++i) {
+    axes.push_back(i);
   }
+  const auto& runner_mean = NpuOpRunner("ReduceMeanD", {*in}, {mean},
+                                        {{"axes", axes}, {"keep_dims", false}});
+  std::vector<T> vec;
+  TensorToVector(mean, exe_ctx.device_context(), &vec);
 
-  return found_inf_data;
+  if (std::isnan(static_cast<float>(vec[0]))) {
+    return true;
+  }
+  return false;
 }
 #endif
@@ -216,22 +214,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
     framework::Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
 
-    bool check_numerics = false;
+    bool has_nan = false;
 
     auto d_type = in->type();
     switch (d_type) {
       case framework::proto::VarType::FP16:
       case framework::proto::VarType::FP32: {
-        VLOG(4) << "prepare to FoundNanInf";
-        check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
-        VLOG(4) << "check_numerics:" << check_numerics;
+        VLOG(4) << "prepare to check nan";
+        has_nan = ContainsNan<T>(ctx, dev_ctx->stream(), in);
+        VLOG(4) << "ContainsNan:" << has_nan;
         break;
       }
       default:
         break;
     }
 
-    if (check_numerics) {
+    if (has_nan) {
       T inf = static_cast<T>(std::numeric_limits<float>::infinity());
       VLOG(4) << "fill input data constant inf";
       auto dims = in->dims();
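The refined check relies on NaN propagating through an arithmetic mean: the "ReduceMeanD" run collapses the whole tensor to a single scalar, and that scalar is NaN whenever any input element is NaN, so only one value has to be copied back to the host and tested with std::isnan. A minimal host-side sketch of the same idea (plain C++ over a std::vector; ContainsNanHost is a hypothetical name, not part of Paddle's NPU code path):

// Illustration only: a NaN anywhere in the data propagates into the mean,
// so a single scalar check flags the whole buffer.
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

bool ContainsNanHost(const std::vector<float>& data) {
  if (data.empty()) return false;
  // Same reduction idea as the ReduceMeanD call above, done on the CPU.
  float sum = std::accumulate(data.begin(), data.end(), 0.0f);
  float mean = sum / static_cast<float>(data.size());
  return std::isnan(mean);
}

int main() {
  std::vector<float> ok = {1.0f, 2.0f, 3.0f};
  std::vector<float> bad = {1.0f, std::nanf(""), 3.0f};
  std::cout << ContainsNanHost(ok) << "\n";   // prints 0
  std::cout << ContainsNanHost(bad) << "\n";  // prints 1
  return 0;
}

Unlike the CheckNumerics-based version it replaces, this probe targets NaN specifically: a tensor whose only non-finite values are Inf of one sign generally reduces to an Inf mean, which std::isnan does not flag.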