Commit dae3e1f3 (unverified)
Authored Feb 09, 2021 by ShenLiang; committed via GitHub on Feb 09, 2021.

Solve inconsistent order in each card in dynamic graph (#30931)
Parent commit: 14d039e4
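
Background, as read from the diff below (the original commit message is only the title above): the old MarkVarReady appended each ready gradient to group.dense_tensors_ with push_back as its backward hook fired, so the layout of the fused buffer followed hook-firing order, which is not guaranteed to be the same on every card; the cards then allreduced buffers with mismatched layouts. The fix pre-allocates one slot per variable in InitializeDenseGroups and has MarkVarReady write each gradient into its fixed inside_group_index slot, so the layout is identical on all cards. A minimal standalone sketch of the difference (plain C++, hypothetical names, not the Paddle API):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Each variable owns a fixed slot inside its group (like inside_group_index
// in the reducer); backward hooks may fire in any order.
struct ReadyEvent {
  std::size_t slot;  // slot assigned when the group was built
  std::string grad;  // stands in for the gradient tensor
};

// Order-dependent layout: whichever hook fires first lands first (old scheme).
std::vector<std::string> LayoutByArrival(const std::vector<ReadyEvent>& evs) {
  std::vector<std::string> buf;
  for (const auto& e : evs) buf.push_back(e.grad);
  return buf;
}

// Deterministic layout: every gradient goes to its fixed slot (new scheme).
std::vector<std::string> LayoutBySlot(const std::vector<ReadyEvent>& evs,
                                      std::size_t group_size) {
  std::vector<std::string> buf(group_size);
  for (const auto& e : evs) buf[e.slot] = e.grad;
  return buf;
}

int main() {
  // Two cards see the same gradients but their hooks fire in different order.
  std::vector<ReadyEvent> card0 = {{0, "gA"}, {1, "gB"}, {2, "gC"}};
  std::vector<ReadyEvent> card1 = {{2, "gC"}, {0, "gA"}, {1, "gB"}};
  for (const auto& g : LayoutByArrival(card0)) std::cout << g << ' ';
  std::cout << "| card0 arrival order\n";
  for (const auto& g : LayoutByArrival(card1)) std::cout << g << ' ';
  std::cout << "| card1 arrival order (differs!)\n";
  for (const auto& g : LayoutBySlot(card1, 3)) std::cout << g << ' ';
  std::cout << "| slot order, identical on both cards\n";
}

With slot-indexed placement the buffer layout depends only on group construction, so every card concatenates and reduces the same layout regardless of hook order.
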
Showing 3 changed files with 61 additions and 44 deletions.

paddle/fluid/imperative/reducer.cc            +55 -42
paddle/fluid/imperative/reducer.h              +3  -1
paddle/fluid/imperative/tests/test_group.cc    +3  -1

paddle/fluid/imperative/reducer.cc
@@ -181,11 +181,6 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
 #endif
 
 void Group::ConcatTensors(const platform::DeviceContext &context) {
-  VLOG(3) << "Before concat, set output tensor size is " << all_length_;
-  auto tensor = dense_contents_.GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim({all_length_}))
-      .mutable_data(context.GetPlace(), dtype_);
-
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_NCCL
@@ -320,6 +315,9 @@ void Reducer::InitializeDenseGroups(
     p_group->length_.push_back(size);
 
+    // for concat operator
+    p_group->dense_tensors_.push_back(framework::Tensor());
+
     // check the dtype and place, it must be same.
     auto dtype = var->DataType();
     auto place = var->Place();
@@ -341,6 +339,7 @@ void Reducer::InitializeDenseGroups(
       place_ = place;
     }
   }
+  p_group->all_length_ = all_length;
 }
 
 // Each parameter will be initialized according to the group information.
@@ -375,6 +374,9 @@ void Reducer::InitializeGroups(
     } else {
       // process the dense gradient.
       InitializeDenseGroups(variable_indices_, &group);
+      auto tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim({group.all_length_}))
+          .mutable_data(place_, group.dtype_);
     }
 
     // map variables to this group by VariableLocator
@@ -436,9 +438,6 @@ void Reducer::PrepareForBackward(
   next_group_ = 0;
   std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
     group.pending_ = group.variable_indices_.size();
-    group.all_length_ = 0;
-    group.dense_tensors_.clear();
-    group.dense_tensors_.reserve(group.pending_);
     group.sparse_contents_ = nullptr;
   });
@@ -564,22 +563,42 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
   auto group_index = var_locator.group_index;
   auto &group = groups_[group_index];
 
-  if (is_used_var) {
-    auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar();
-    if (!group.is_sparse_) {
-      auto grad = var_warpper->MutableVar();
-      auto inside_group_index = var_locator.inside_group_index;
-      auto length = group.length_[inside_group_index];
-      auto tensor = grad->GetMutable<framework::LoDTensor>();
-      framework::Tensor tmp;
-      tmp.ShareDataWith(*tensor).Resize({static_cast<int64_t>(length)});
-      group.dense_tensors_.push_back(std::move(tmp));
-      group.all_length_ += length;
+  if (!group.is_sparse_) {
+    // process dense group
+    auto inside_group_index = var_locator.inside_group_index;
+    auto length = group.length_[inside_group_index];
+    auto &group_tensor = group.dense_tensors_[inside_group_index];
+    if (is_used_var) {
+      auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar();
+      auto tensor =
+          var_warpper->MutableVar()->GetMutable<framework::LoDTensor>();
+      group_tensor.ShareDataWith(*tensor).Resize(
+          {static_cast<int64_t>(length)});
+    } else {
+      if (!group_tensor.IsInitialized()) {
+        group_tensor.Resize({static_cast<int64_t>(length)});
+        group_tensor.mutable_data(place_, group.dtype_);
+#ifdef PADDLE_WITH_XPU_BKCL
+        if (platform::is_xpu_place(group_tensor.place())) {
+          // TODO(liuyuhui) support XPU set constant
+          VLOG(3) << "XPU doesn't support set_constant";
+        }
+#else
+        auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
+        operators::math::set_constant(*dev_ctx, &group_tensor, 0.0);
+#endif
+      }
+    }
+  } else {
+    // process sparse group
+    if (is_used_var) {
+      auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar();
+      group.sparse_contents_ = var_warpper->MutableVar();
+    } else {
+      group.sparse_contents_ = nullptr;
+    }
+  }
 
   if (--group.pending_ == 0) {
     // can start allreduce
     MarkGroupReady(group_index);
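
One consequence of the rewrite above: MarkVarReady now runs for every variable in the dense group, including ones that produced no gradient this step (is_used_var == false). Their pre-sized slots are allocated once and zero-filled with operators::math::set_constant, so the fused buffer is always fully defined; because allreduce sums the per-card buffers, an all-zero slot contributes nothing to the result. A small sketch of that invariant (plain C++, illustrative only, no real communication):

#include <cstddef>
#include <iostream>
#include <vector>

// Per-card flat gradient buffer; a card that never produced a gradient for
// the last slot contributes zeros there, like set_constant(..., 0.0).
std::vector<float> MakeCardBuffer(bool var_used) {
  std::vector<float> buf = {1.0f, 2.0f};  // gradients of the used variables
  buf.push_back(var_used ? 3.0f : 0.0f);  // unused slot stays zero-filled
  return buf;
}

// Toy allreduce-sum across cards (stands in for AllReduceByStream).
std::vector<float> AllReduceSum(const std::vector<std::vector<float>>& cards) {
  std::vector<float> out(cards[0].size(), 0.0f);
  for (const auto& c : cards)
    for (std::size_t i = 0; i < c.size(); ++i) out[i] += c[i];
  return out;
}

int main() {
  // Card 0 used the last variable this step; card 1 did not (control flow).
  auto sum = AllReduceSum({MakeCardBuffer(true), MakeCardBuffer(false)});
  for (float v : sum) std::cout << v << ' ';  // prints: 2 4 3
  std::cout << '\n';                          // the zero slot is neutral
}
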
@@ -619,36 +638,30 @@ void Reducer::MarkGroupReady(size_t group_index) {
               << "] has no var to allreduce";
       }
     } else {
-      if (!group.dense_tensors_.empty()) {
-        VLOG(3) << "dense group [" << next_group_
-                << "] start allreduce in ring[" << run_order << "]";
-        // Select common commstream to concat tensors
-        // group.dense_tensors ---> group.dense_contents_
-        group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
+      VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring["
+              << run_order << "]";
+      // Select common commstream to concat tensors
+      // group.dense_tensors ---> group.dense_contents_
+      group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
 
-        // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support
-        // default stream for communicating,
-        // so there exist some problems in synchronization. And need to add a WaitComm
-        // there.
+      // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only
+      // support default stream for communicating, so there exist some problems
+      // in synchronization. And need to add a WaitComm there.
       // TODO(liuyuhui): If BKCL support events, it should be fixed as non-blocking
       // communication.
 #ifdef PADDLE_WITH_XPU_BKCL
-        if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
-          parallel_ctx_->WaitComm(run_order);
-        }
+      if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
+        parallel_ctx_->WaitComm(run_order);
+      }
 #endif
-        // Start allreduce
-        parallel_ctx_->AllReduceByStream(
-            group.dense_contents_, &(group.dense_contents_), run_order, false);
+      // Start allreduce
+      parallel_ctx_->AllReduceByStream(
+          group.dense_contents_, &(group.dense_contents_), run_order, false);
 
-        // Select common commstream to split tensors
-        // group.dense_contents_ ---> group.dense_tensors
-        group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order));
-      } else {
-        VLOG(3) << "The dense group[" << next_group_
-                << "] has no var to allreduce";
-      }
+      // Select common commstream to split tensors
+      // group.dense_contents_ ---> group.dense_tensors
+      group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order));
     }
   }
 }
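
For orientation, the dense path above follows the usual fused-gradient pattern: concatenate the group's per-parameter tensors into one contiguous buffer (ConcatTensors), run a single collective on it (AllReduceByStream), then split the result back into per-parameter views (SplitTensors); one call per group amortizes the launch cost of many small allreduces. A minimal single-process sketch of the flatten/unflatten round trip (plain C++, illustrative names, the doubling stands in for the collective):

#include <cstddef>
#include <iostream>
#include <vector>

// Flatten per-parameter gradients into one buffer (like ConcatTensors).
std::vector<float> Concat(const std::vector<std::vector<float>>& grads) {
  std::vector<float> flat;
  for (const auto& g : grads) flat.insert(flat.end(), g.begin(), g.end());
  return flat;
}

// Split the fused buffer back into the original lengths (like SplitTensors).
std::vector<std::vector<float>> Split(const std::vector<float>& flat,
                                      const std::vector<std::size_t>& lens) {
  std::vector<std::vector<float>> grads;
  std::size_t offset = 0;
  for (std::size_t len : lens) {
    grads.emplace_back(flat.begin() + offset, flat.begin() + offset + len);
    offset += len;
  }
  return grads;
}

int main() {
  std::vector<std::vector<float>> grads = {{1, 2}, {3}, {4, 5, 6}};
  auto flat = Concat(grads);           // one buffer -> one collective call
  for (auto& v : flat) v *= 2;         // stand-in for the allreduce result
  auto back = Split(flat, {2, 1, 3});  // per-parameter views restored
  std::cout << back[2][1] << '\n';     // prints 10
}
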
paddle/fluid/imperative/reducer.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace platform {
@@ -133,7 +134,8 @@ class Reducer {
   int nrings_ = 1;
 
   // Following variables are to help rebuild group
-  bool has_rebuilt_group_{false};
+  // TODO(shenliang03): Support rebuild in the future.
+  bool has_rebuilt_group_{true};
   std::vector<std::shared_ptr<imperative::VarBase>> rebuild_vars_;
   std::vector<int64_t> rebuild_var_indices_;
   const std::vector<size_t> group_size_limits_;
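
The header change flips has_rebuilt_group_ from false to true; per the new TODO, group rebuilding is switched off until it is supported again, presumably because a rebuilt grouping would invalidate the fixed slot layout set up at initialization. A guard of roughly this shape (hypothetical, not the Paddle API) would make the rebuild path a no-op:

#include <iostream>

// Hypothetical guard mirroring has_rebuilt_group_: when it already reads
// true (the new default), rebuilding is skipped and groups keep the layout
// fixed at initialization time.
bool MaybeRebuildGroups(bool& has_rebuilt_group) {
  if (has_rebuilt_group) return false;  // already rebuilt / rebuild disabled
  // ... regrouping logic would run here ...
  has_rebuilt_group = true;
  return true;
}

int main() {
  bool has_rebuilt_group = true;  // matches has_rebuilt_group_{true}
  std::cout << MaybeRebuildGroups(has_rebuilt_group) << '\n';  // prints 0
}
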
paddle/fluid/imperative/tests/test_group.cc
@@ -94,9 +94,11 @@ void GroupConcatSplit(Place place, size_t size) {
   auto *dev_ctx = pool.Get(place);
 
   {  // concat
+    auto *tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
+    tensor->Resize(framework::make_ddim({group.all_length_}))
+        .mutable_data(place, group.dtype_);
     group.ConcatTensors(*dev_ctx);
 
-    auto *tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
     framework::Tensor tmp;
     framework::TensorCopySync(*tensor, cpu_place, &tmp);
     auto *data = tmp.data<T>();