BaiXuePrincess / Paddle (fork of PaddlePaddle / Paddle, in sync with upstream)
Commit 298f210d (unverified)
Authored by ShenLiang on May 11, 2021; committed via GitHub on May 11, 2021
Parent: a9e53050

Support control flow in DataParallel (#32826)

* fix find_unused_parameters default value
Showing 10 changed files with 95 additions and 65 deletions (+95 −65)
paddle/fluid/framework/distributed_strategy.proto  (+1 −1)
paddle/fluid/imperative/reducer.cc  (+64 −46)
paddle/fluid/imperative/reducer.h  (+6 −2)
python/paddle/distributed/fleet/base/distributed_strategy.py  (+1 −1)
python/paddle/fluid/dygraph/parallel.py  (+6 −9)
python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py  (+2 −2)
python/paddle/fluid/tests/unittests/spawn_runner_base.py  (+1 −0)
python/paddle/fluid/tests/unittests/test_dist_base.py  (+7 −4)
python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py  (+6 −0)
python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py  (+1 −0)
paddle/fluid/framework/distributed_strategy.proto

@@ -172,7 +172,7 @@ message DistributedStrategy {
   optional bool fp16_allreduce = 25 [ default = false ];
   optional bool sharding = 26 [ default = false ];
   optional float last_comm_group_size_MB = 27 [ default = 1 ];
-  optional bool find_unused_parameters = 28 [ default = true ];
+  optional bool find_unused_parameters = 28 [ default = false ];
   optional bool tensor_parallel = 29 [ default = false ];
   optional bool without_graph_optimization = 30 [ default = false ];
paddle/fluid/imperative/reducer.cc

@@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector<std::shared_ptr<imperative::VarBase>> &vars,
       is_sparse_gradient_(is_sparse_gradient),
       parallel_ctx_(parallel_ctx),
       group_size_limits_(group_size_limits),
-      find_unused_vars_(find_unused_vars) {
+      find_unused_vars_each_step_(find_unused_vars) {
   VLOG(3) << "Start construct the Reducer ...";
   nrings_ = parallel_ctx->GetNRings();
   nranks_ = parallel_ctx->GetNRanks();
@@ -457,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set<GradOpNode *> &init_nodes) {
   }
 }
 
-// After each batch is calculated, the counter of each group(group.pending_)
-// and allreudce sequence counter(next_group_) will be cleaned up again.
-void Reducer::PrepareForBackward(
+void Reducer::TraverseBackwardGraph(
     const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
-  VLOG(3) << "after forward, then reset count for backward.";
-  next_group_ = 0;
-  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
-    group.pending_ = group.variable_indices_.size();
-    group.sparse_contents_ = nullptr;
-  });
-
-  // reinitialize vars_marked_ready_ for next iteration
-  vars_marked_ready_.clear();
-  vars_marked_ready_.resize(vars_.size(), false);
-
-  PADDLE_ENFORCE_EQ(
-      groups_need_finalize_, false,
-      platform::errors::PreconditionNotMet(
-          "A serious error has occurred here. There may be several reasons: "
-          "1) Please note that all forward outputs derived from the module "
-          "parameters must participate in the calculation of losses and "
-          "subsequent gradient calculations. If not, the wrapper will hang, "
-          "waiting for autograd to generate gradients for these parameters. "
-          "you can use detach or stop_gradient to make the unused parameters "
-          "detached from the autograd graph. "
-          "2) Used multiple forwards and one backward. You may be able to wrap "
-          "multiple forwards in a model."));
-
-  // The first var to trigger the unused parameter
-  has_marked_unused_vars_ = false;
-  unused_vars_.clear();
-
-  if (!find_unused_vars_) {
-    return;
-  }
-
   node_deps_.clear();
   std::queue<std::shared_ptr<GradOpNode>> q;
   std::unordered_set<VariableWrapper *> var_visited;
@@ -554,8 +520,50 @@ void Reducer::PrepareForBackward(
               << "] is not used";
     }
   }
+}
 
-  if (unused_vars_.empty()) {
+// After each batch is calculated, the counter of each group(group.pending_)
+// and allreudce sequence counter(next_group_) will be cleaned up again.
+void Reducer::PrepareForBackward(
+    const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
+  VLOG(3) << "after forward, then reset count for backward.";
+  next_group_ = 0;
+  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
+    group.pending_ = group.variable_indices_.size();
+    group.sparse_contents_ = nullptr;
+  });
+
+  // reinitialize vars_marked_ready_ for next iteration
+  vars_marked_ready_.clear();
+  vars_marked_ready_.resize(vars_.size(), false);
+
+  PADDLE_ENFORCE_EQ(
+      groups_need_finalize_, false,
+      platform::errors::PreconditionNotMet(
+          "A serious error has occurred here. Please "
+          "set find_unused_parameters=True to traverse backward graph "
+          "in each step to prepare reduce in advance. If you have "
+          "set, There may be several reasons for this error: "
+          "1) Please note that all forward outputs derived from the module "
+          "parameters must participate in the calculation of losses and "
+          "subsequent gradient calculations. If not, the wrapper will hang, "
+          "waiting for autograd to generate gradients for these parameters. "
+          "you can use detach or stop_gradient to make the unused parameters "
+          "detached from the autograd graph. "
+          "2) Used multiple forwards and one backward. You may be able to wrap "
+          "multiple forwards in a model."));
+
+  // The first var to trigger the unused parameter
+  has_marked_unused_vars_ = false;
+
+  if (find_unused_vars_once_ || find_unused_vars_each_step_) {
+    unused_vars_.clear();
+    TraverseBackwardGraph(outputs);
+    // only check once in first step
+    find_unused_vars_once_ = false;
+  }
+
+  if (find_unused_vars_each_step_ && unused_vars_.empty()) {
     LOG_FIRST_N(WARNING, 1)
         << "All parameters are involved in the backward pass. "
            "It is recommended to set find_unused_parameters to False "
@@ -564,7 +572,9 @@ void Reducer::PrepareForBackward(
           "will occur. Please make it clear that in the subsequent "
           "training, there will be no parameters that are not used "
           "in the backward pass, and then set find_unused_parameters";
-  } else if (unused_vars_.size() == vars_.size()) {
+  }
+
+  if (unused_vars_.size() == vars_.size()) {
     LOG_FIRST_N(WARNING, 1)
         << "There is no parameter in the device involved "
           "in the backward calculation. If there are "
@@ -595,13 +605,13 @@ void Reducer::AddDistHook(size_t var_index) {
   local_used_vars_[var_index] = 1;
 
-  // rebuild group when find_unused_vars_ is false
+  // rebuild group when find_unused_vars_each_step_ is false
   if (NeedRebuildGroup()) {
     rebuild_vars_.push_back(vars_[var_index]);
     rebuild_var_indices_.push_back(var_index);
   }
 
-  if (!has_marked_unused_vars_ && find_unused_vars_) {
+  if (!has_marked_unused_vars_) {
     has_marked_unused_vars_ = true;
     for (const auto &unused_index : unused_vars_) {
       MarkVarReady(unused_index, false);
@@ -622,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
   if (vars_marked_ready_[var_index]) {
     auto error_info = string::Sprintf(
         "Error happened, when parameter[%d][%s] has been ready before. "
-        "There may be several reasons for this error: "
+        "Please set find_unused_parameters=True to traverse backward graph "
+        "in each step to prepare reduce in advance. If you have set, "
+        "there may be several reasons for this error: "
        "1) In multiple reentrant backward phase, some parameters are reused."
        "2) Using model parameters outside of forward function. Please "
        "make sure that model parameters are not shared in concurrent "
@@ -690,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
     }
   } else {
     // process sparse group
-    PADDLE_ENFORCE_EQ(HasGrad(var_index), true,
-                      platform::errors::PreconditionNotMet(
-                          "The sparse parameter[%d][%s] must have a gradient",
-                          var_index, vars_[var_index]->Name()));
+    PADDLE_ENFORCE_EQ(
+        HasGrad(var_index), true,
+        platform::errors::PreconditionNotMet(
+            "The sparse parameter[%d][%s] should have gradient. "
+            "Currently, DataParallel does not support sparse "
+            "parameters without generating gradients during training. "
+            "For example, if is_sparese=True is used in Embedding, "
+            "the current step of this parameter cannot generate gradient "
+            "because of stop_gradient/detatch, where error will occur.",
+            var_index, vars_[var_index]->Name()));
     auto var_base = vars_[var_index]->GradVarBase();
     // need to check tensor type
     PADDLE_ENFORCE_EQ(
@@ -943,7 +961,7 @@ void Reducer::FinalizeBackward() {
     InitializeGroups(group_indices_);
   }
 
-  if (find_unused_vars_) {
+  if (find_unused_vars_each_step_) {
 // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ProcessUnusedDenseVars();
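With this split, the Reducer still traverses the backward graph once on the first step (find_unused_vars_once_ defaults to true) even when find_unused_parameters is left at its new default of False, and only repeats the traversal every step when the flag is True. The sketch below is illustrative only; the model and training loop are assumed, not taken from this diff. It shows the common case where every parameter always contributes to the loss, so the default setting avoids the per-step traversal that PrepareForBackward would otherwise trigger.

# Illustrative sketch (assumed model and loop, not part of this diff):
# with the new default find_unused_parameters=False, the backward graph is
# not re-traversed every step, since all parameters get gradients anyway.
import paddle
import paddle.nn as nn

paddle.distributed.init_parallel_env()

layer = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 1))
model = paddle.DataParallel(layer)  # find_unused_parameters now defaults to False

opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
for _ in range(2):
    x = paddle.randn([4, 10])
    loss = model(x).mean()
    loss.backward()   # every parameter participates, no unused-var handling needed
    opt.step()
    opt.clear_grad()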
paddle/fluid/imperative/reducer.h

@@ -162,13 +162,16 @@ class Reducer {
   std::vector<std::vector<size_t>> RebuildGruops();
 
   inline bool NeedRebuildGroup() {
-    return !has_rebuilt_group_ && !find_unused_vars_;
+    return !has_rebuilt_group_ && !find_unused_vars_each_step_;
   }
 
   void ProcessUnusedDenseVars();
 
   bool HasGrad(size_t var_index);
 
+  void TraverseBackwardGraph(
+      const std::vector<std::shared_ptr<imperative::VarBase>>& outputs);
+
  private:
   std::vector<std::shared_ptr<imperative::VarBase>> vars_;
   std::vector<std::vector<size_t>> group_indices_;
@@ -195,7 +198,8 @@ class Reducer {
   std::unordered_map<VariableWrapper *, size_t> var_index_map_;
   std::vector<size_t> unused_vars_;
   bool has_marked_unused_vars_{false};
-  bool find_unused_vars_{false};
+  bool find_unused_vars_each_step_{false};
+  bool find_unused_vars_once_{true};
   bool groups_need_finalize_{false};
 #ifdef PADDLE_WITH_XPU_BKCL
   // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training.
python/paddle/distributed/fleet/base/distributed_strategy.py

@@ -626,7 +626,7 @@ class DistributedStrategy(object):
         Indicating whether we are using find_unused_parameters to
         find unused parameters in DataParallel.
 
-        Default value: True
+        Default value: False
 
         Examples:
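For reference, a minimal sketch of toggling this option through the fleet API, mirroring what test_dist_base.py below does; the fleet.init collective setup shown here is assumed context, not part of this diff.

# Minimal sketch (assumed setup): enable unused-parameter detection through
# DistributedStrategy; the new default is False.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
# Only needed when some parameters may receive no gradient in a step,
# e.g. models with data-dependent control flow.
strategy.find_unused_parameters = True

fleet.init(is_collective=True, strategy=strategy)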
python/paddle/fluid/dygraph/parallel.py

@@ -417,14 +417,15 @@ class DataParallel(layers.Layer):
                              Note that setting the find_unused_parameters to True
                              will affect computing performance. Therefore, if all parameters
                              are sure to participate in the loss calculation and the
-                             autograd graph construction, please set it False. Default: True.
+                             autograd graph construction, please set it False. Default: False.
 
     Returns:
         Layer: The data paralleled module.
 
     Examples:
         .. code-block:: python
 
            # required: distributed
            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
@@ -474,7 +475,7 @@ class DataParallel(layers.Layer):
                  strategy=None,
                  comm_buffer_size=25,
                  last_comm_buffer_size=1,
-                 find_unused_parameters=True):
+                 find_unused_parameters=False):
         super(DataParallel,
               self).__init__(layers.full_name() + "_data_parallel")
@@ -576,12 +577,8 @@ class DataParallel(layers.Layer):
     def forward(self, *inputs, **kwargs):
         outputs = self._layers(*inputs, **kwargs)
         if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad:
-            if self.find_unused_parameters:
-                self._reducer.prepare_for_backward(
-                    list(self._find_varbase(outputs)))
-            else:
-                self._reducer.prepare_for_backward(
-                    list(self._find_varbase([])))
+            self._reducer.prepare_for_backward(
+                list(self._find_varbase(outputs)))
         return outputs
 
     @deprecated(
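As an illustration of the control-flow case this commit targets (a sketch only; the branching model and the simplified loop are assumed, not taken from the diff), a dygraph model whose forward path depends on the data can leave some parameters without gradients in a given step. That is exactly the situation where find_unused_parameters=True must now be passed explicitly, since it is no longer the default.

# Sketch (assumed model, not from this diff): data-dependent control flow under
# DataParallel; pass find_unused_parameters=True because a branch may be skipped.
import paddle
import paddle.nn as nn

class BranchNet(nn.Layer):
    def __init__(self):
        super(BranchNet, self).__init__()
        self.fc_a = nn.Linear(10, 10)
        self.fc_b = nn.Linear(10, 10)

    def forward(self, x):
        # Which branch runs depends on the batch, so fc_b's parameters may
        # produce no gradient in a given step.
        if paddle.mean(x) > 0:
            return self.fc_a(x)
        return self.fc_b(x)

paddle.distributed.init_parallel_env()
model = paddle.DataParallel(BranchNet(), find_unused_parameters=True)

x = paddle.randn([4, 10])
loss = model(x).mean()
loss.backward()

Launching would still go through the usual multi-process entry points (paddle.distributed.launch or spawn); the point here is only the constructor flag.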
python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py

@@ -74,8 +74,8 @@ class TestDistTraning(unittest.TestCase):
         state_dict = model_a.state_dict()
         model_b.set_state_dict(state_dict)
 
-        model_a = paddle.DataParallel(model_a)
-        model_b = paddle.DataParallel(model_b)
+        model_a = paddle.DataParallel(model_a, find_unused_parameters=True)
+        model_b = paddle.DataParallel(model_b, find_unused_parameters=True)
 
         ones_input = paddle.ones(shape=(batch, in_dim))
         ones_input.stop_gradient = True
python/paddle/fluid/tests/unittests/spawn_runner_base.py

@@ -27,6 +27,7 @@ from test_dist_base import RUN_STEP
 class SpawnAssistTestArgs(object):
     update_method = "local"
     trainer_id = 0
+    find_unused_parameters = False
 
 
 class TestDistSpawnRunner(unittest.TestCase):
python/paddle/fluid/tests/unittests/test_dist_base.py

@@ -548,7 +548,10 @@ class TestParallelDyGraphRunnerBase(object):
         # 4. train model
         model, train_reader, opt = self.get_model()
         if args.update_method == "nccl2":
-            model = paddle.DataParallel(model)
+            if args.find_unused_parameters:
+                model = paddle.DataParallel(model, find_unused_parameters=True)
+            else:
+                model = paddle.DataParallel(model, find_unused_parameters=False)
 
         out_losses = []
         for step_id, data in enumerate(train_reader()):
@@ -581,8 +584,8 @@ class TestParallelDyGraphRunnerBase(object):
         # set strategy
         strategy = fleet.DistributedStrategy()
-        if not args.find_unused_parameters:
-            strategy.find_unused_parameters = False
+        if args.find_unused_parameters:
+            strategy.find_unused_parameters = True
 
         # 3. init parallel env
         if args.update_method == "nccl2" or "bkcl":
@@ -737,7 +740,7 @@ class TestDistBase(unittest.TestCase):
         self._save_model = False
         self._fuse_all_reduce = None
         self._accumulate_gradient = False
-        self._find_unused_parameters = True
+        self._find_unused_parameters = False
         self._setup_config()
 
         global DIST_UT_PORT
python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py

@@ -30,6 +30,7 @@ class TestDygraphControlFlowSame(TestDistBase):
         self._sync_mode = False
         self._nccl2_mode = True
         self._dygraph = True
+        self._find_unused_parameters = True
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
@@ -46,6 +47,7 @@ class TestFleetDygraphControlFlowSame(TestDygraphControlFlowSame):
         self._nccl2_mode = True
         self._dygraph = True
         self._use_fleet_api = True
+        self._find_unused_parameters = True
 
 
 class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame):
@@ -54,6 +56,7 @@ class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame):
         self._nccl2_mode = True
         self._dygraph = True
         self._accumulate_gradient = True
+        self._find_unused_parameters = True
 
 
 class TestDygraphControlFlowDiff(TestDistBase):
@@ -61,6 +64,7 @@ class TestDygraphControlFlowDiff(TestDistBase):
         self._sync_mode = False
         self._nccl2_mode = True
         self._dygraph = True
+        self._find_unused_parameters = True
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
@@ -77,6 +81,7 @@ class TestFleetDygraphControlFlowDiff(TestDygraphControlFlowDiff):
         self._nccl2_mode = True
         self._dygraph = True
         self._use_fleet_api = True
+        self._find_unused_parameters = True
 
 
 class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff):
@@ -85,6 +90,7 @@ class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff):
         self._nccl2_mode = True
         self._dygraph = True
         self._accumulate_gradient = True
+        self._find_unused_parameters = True
 
 
 if __name__ == "__main__":
python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py

@@ -31,6 +31,7 @@ class TestParallelDygraphMnist(TestDistBase):
         self._sync_mode = False
         self._nccl2_mode = True
         self._dygraph = True
+        self._find_unused_parameters = True
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():