Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
02cc3c5e
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
02cc3c5e
编写于
7月 29, 2021
作者:
G
gongweibao
提交者:
GitHub
7月 29, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix allreduce_sum potential bugs on NPU. (#34462)
上级
b56dbe08
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
84 additions
and
8 deletions
+84
-8
paddle/fluid/framework/section_worker.cc
paddle/fluid/framework/section_worker.cc
+5
-0
paddle/fluid/operators/collective/c_allreduce_op.h
paddle/fluid/operators/collective/c_allreduce_op.h
+79
-8
未找到文件。
paddle/fluid/framework/section_worker.cc
浏览文件 @
02cc3c5e
...
...
@@ -164,6 +164,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
while
(
fw_step
<
startup_steps
)
{
RunForward
(
fw_step
,
gc
,
unused_vars_
);
fw_step
+=
1
;
VLOG
(
2
)
<<
"micro steps fw_step:"
<<
fw_step
;
}
// 1f1b phase
...
...
@@ -180,6 +181,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
fw_step
+=
1
;
bw_step
+=
1
;
VLOG
(
2
)
<<
"micro steps fw_step:"
<<
fw_step
<<
", bw_step:"
<<
bw_step
;
}
int
reserve_bw_send_step
=
bw_step
-
2
;
...
...
@@ -187,8 +189,10 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
while
(
bw_step
<
num_microbatches_
)
{
RunBackward
(
bw_step
,
gc
,
unused_vars_
);
bw_step
+=
1
;
VLOG
(
2
)
<<
"micro steps bw_step:"
<<
bw_step
;
}
VLOG
(
2
)
<<
"run update"
;
RunUpdate
(
gc
,
unused_vars_
);
if
(
gc
)
{
...
...
@@ -203,6 +207,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
void
SectionWorker
::
TrainFiles
()
{
VLOG
(
5
)
<<
"begin section_worker TrainFiles"
;
VLOG
(
2
)
<<
"mini batch steps:"
<<
batch_id_
;
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
std
::
unique_ptr
<
GarbageCollector
>
gc
;
...
...
paddle/fluid/operators/collective/c_allreduce_op.h
浏览文件 @
02cc3c5e
...
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/npu_op_runner.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL)
...
...
@@ -119,13 +120,45 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
}
};
#if defined(PADDLE_WITH_ASCEND_CL)
// Runs the NPU "CheckNumerics" operator over `in` on `stream`, then waits on
// the NPU device context taken from `exe_ctx`.
//
// Returns true if running the operator or waiting on the device context
// throws; the function logs that case as "NaN or INF detected". Returns false
// when both complete without throwing.
//
// NOTE(review): every exception type is reported as NaN/INF, so an unrelated
// runtime failure cannot be told apart from a real detection here -- confirm
// this is intended.
template <typename T>
bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
                   aclrtStream stream, const paddle::framework::Tensor* in) {
  auto& npu_ctx =
      exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();

  // Output tensor handed to the operator; it mirrors the input's type and
  // dims and its contents are never read in this function.
  paddle::framework::Tensor scratch(in->type());
  scratch.Resize(in->dims());
  scratch.mutable_data<T>(npu_ctx.GetPlace());

  try {
    const NpuOpRunner numerics_op(
        "CheckNumerics", {*in}, {scratch},
        {{"message", std::string("check_numberics")}});
    numerics_op.Run(stream);
    // Block until the device context finishes so a failure surfaces inside
    // this try block.
    npu_ctx.Wait();
  } catch (...) {
    // Covers platform::EnforceNotMet as well as any other exception type;
    // all of them are handled the same way.
    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
    return true;
  }
  return false;
}
#endif
template
<
ReduceType
red_type
,
typename
T
>
class
CAllReduceOpASCENDKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
#if defined(PADDLE_WITH_ASCEND_CL)
auto
in
=
ctx
.
Input
<
framework
::
LoD
Tensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
framework
::
LoD
Tensor
>
(
"Out"
);
auto
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
auto
place
=
ctx
.
GetPlace
();
HcclDataType
dtype
=
platform
::
ToHCCLDataType
(
in
->
type
());
int64_t
numel
=
in
->
numel
();
...
...
@@ -141,9 +174,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
paddle
::
platform
::
HCCLCommContext
::
Instance
().
Get
(
ring_id
,
place
);
aclrtStream
stream
=
nullptr
;
auto
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
auto
dev_ctx
=
static_cast
<
platform
::
NPUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
stream
=
static_cast
<
platform
::
NPUDeviceContext
*>
(
dev_ctx
)
->
stream
();
stream
=
dev_ctx
->
stream
();
}
else
{
stream
=
comm
->
stream
();
}
...
...
@@ -171,9 +205,46 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
"Invalid reduce type: %d"
,
red_type
));
}
VLOG
(
3
)
<<
"begin hccl allreduce, parameter is: "
VLOG
(
3
)
<<
"hccl allreduce, parameter is: "
<<
"input num: "
<<
in
->
dims
()
<<
"dtype: "
<<
dtype
<<
"hccl_red_type: "
<<
hccl_red_type
<<
", group is: "
<<
group
<<
", sendbuff:"
<<
sendbuff
<<
", recvbuff:"
<<
recvbuff
<<
", out_size:"
<<
out
->
memory_size
()
<<
", use_calc_stream:"
<<
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
)
<<
", stream:"
<<
stream
;
framework
::
Tensor
tmp
;
tmp
.
mutable_data
<
float
>
({
8
},
ctx
.
GetPlace
());
bool
check_numerics
=
false
;
auto
d_type
=
in
->
type
();
switch
(
d_type
)
{
case
framework
::
proto
::
VarType
::
FP16
:
case
framework
::
proto
::
VarType
::
FP32
:
{
VLOG
(
4
)
<<
"prepare to FoundNanInf"
;
check_numerics
=
CheckNumerics
<
T
>
(
ctx
,
dev_ctx
->
stream
(),
in
);
VLOG
(
4
)
<<
"check_numerics:"
<<
check_numerics
;
break
;
}
default:
break
;
}
if
(
check_numerics
)
{
T
inf
=
static_cast
<
T
>
(
std
::
numeric_limits
<
float
>::
infinity
());
VLOG
(
4
)
<<
"fill input data constant inf"
;
auto
dims
=
in
->
dims
();
auto
mutable_in
=
const_cast
<
framework
::
Tensor
*>
(
in
);
FillNpuTensorWithConstant
<
T
>
(
mutable_in
,
inf
);
mutable_in
->
Resize
(
dims
);
}
VLOG
(
3
)
<<
"hccl allreduce, parameter is: "
<<
"input num: "
<<
numel
<<
"dtype: "
<<
dtype
<<
"hccl_red_type: "
<<
hccl_red_type
<<
", group is: "
<<
group
;
<<
"hccl_red_type: "
<<
hccl_red_type
<<
", group is: "
<<
group
<<
", sendbuff:"
<<
sendbuff
<<
", recvbuff:"
<<
recvbuff
<<
", out_size:"
<<
out
->
memory_size
();
PADDLE_ENFORCE_NPU_SUCCESS
(
platform
::
dynload
::
HcclAllReduce
(
sendbuff
,
recvbuff
,
numel
,
dtype
,
hccl_red_type
,
comm
->
comm
(),
...
...
@@ -198,7 +269,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
auto
place
=
ctx
.
GetPlace
();
BKCLDataType
dtype
=
platform
::
ToBKCLDataType
(
in
->
type
());
int64_t
numel
=
in
->
numel
();
const
void
*
sendbuff
=
in
->
data
<
void
>
();
const
void
*
sendbuff
=
in
->
data
<
T
>
();
out
->
Resize
(
in
->
dims
());
void
*
recvbuff
=
out
->
mutable_data
<
T
>
(
place
);
...
...
@@ -260,7 +331,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
auto
place
=
ctx
.
GetPlace
();
ncclDataType_t
dtype
=
platform
::
ToNCCLDataType
(
in
->
type
());
int64_t
numel
=
in
->
numel
();
const
void
*
sendbuff
=
in
->
data
<
void
>
();
const
void
*
sendbuff
=
in
->
data
<
T
>
();
out
->
Resize
(
in
->
dims
());
void
*
recvbuff
=
out
->
mutable_data
<
T
>
(
place
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录