BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit a96d54ac (unverified)
Authored Jul 29, 2020 by Dong Daxiang; committed by GitHub on Jul 29, 2020
Generate final strategy (#25782)
* refine strategy compiler and meta optimizers make async as a_sync
Parent: 0ca1bb56
Showing 9 changed files with 131 additions and 46 deletions (+131, -46)
paddle/fluid/framework/distributed_strategy.proto                         +2   -2
python/paddle/fleet/base/distributed_strategy.py                          +21  -17
python/paddle/fleet/base/fleet_base.py                                    +14  -7
python/paddle/fleet/base/strategy_compiler.py                             +41  -5
python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py          +9   -0
python/paddle/fleet/meta_optimizers/meta_optimizer_base.py                +3   -0
python/paddle/fleet/meta_optimizers/recompute_optimizer.py                +6   -1
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py    +12  -12
python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer.py          +23  -2
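
The user-visible effect of this commit is that the DistributedStrategy flag "async" becomes "a_sync" and "async_configs" becomes "a_sync_configs", while the proto field numbers and defaults stay the same. A minimal usage sketch, pieced together from the docstring and test examples changed below (the role-maker setup and the "k_steps" value are illustrative, not prescribed by this commit):

    import paddle.fleet as fleet
    import paddle.fluid.incubate.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True                       # was strategy.async_update (proto field "async")
    strategy.a_sync_configs = {"k_steps": 1000}  # was strategy.async_update_configs

    # code block for defining loss and local optimizer
    # sgd = fleet.distributed_optimizer(optimizer, strategy)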

paddle/fluid/framework/distributed_strategy.proto

@@ -85,7 +85,7 @@ message DistributedStrategy {
   optional bool pipeline = 9 [ default = false ];
   optional bool elastic = 10 [ default = false ];
   optional bool auto = 11 [ default = false ];
-  optional bool async = 12 [ default = true ];
+  optional bool a_sync = 12 [ default = true ];
   optional bool sync_nccl_allreduce = 13 [ default = true ];
   optional int32 nccl_comm_num = 14 [ default = 1 ];
   optional bool use_hierarchical_allreduce = 15 [ default = false ];
@@ -99,7 +99,7 @@ message DistributedStrategy {
   optional LocalSGDConfig localsgd_configs = 103;
   optional GradientMergeConfig gradient_merge_configs = 104;
   optional PipelineConfig pipeline_configs = 106;
-  optional AsyncConfig async_configs = 107;
+  optional AsyncConfig a_sync_configs = 107;
   optional BuildStrategy build_strategy = 201;
   optional ExecutionStrategy execution_strategy = 202;

python/paddle/fleet/base/distributed_strategy.py

@@ -201,7 +201,7 @@ class DistributedStrategy(object):
                 f.name).extend(getattr(strategy, f.name))
 
     @property
-    def async_update(self):
+    def a_sync(self):
        """
        Indicating whether we are using asynchronous stocastic gradient descent updates
        for training. This property is valid when we are using parameter server training,
@@ -216,29 +216,29 @@ class DistributedStrategy(object):
            fleet.init(role_maker)
            strategy = fleet.DistributedStrategy()
-           strategy.async_update = True  # by default this is True
+           strategy.a_sync = True  # by default this is True
            # code block for defining loss and local optimizer
            # sgd = fleet.distributed_optimizer(optimizer, strategy)
        """
-        return self.strategy.async
+        return self.strategy.a_sync
 
-    @async_update.setter
-    def async_update(self, flag):
+    @a_sync.setter
+    def a_sync(self, flag):
         if isinstance(flag, bool):
-            self.strategy.async = flag
+            self.strategy.a_sync = flag
         else:
-            print("WARNING: async_update should have value of bool type")
+            print("WARNING: a_sync should have value of bool type")
 
     @property
-    def async_update_configs(self):
+    def a_sync_configs(self):
        """
-       Set async update configurations. In general, asynchronous parameter server
+       Set a_sync update configurations. In general, asynchronous parameter server
        training has serveral configurable settings that can be configured through
        a dict.
        **Notes**:
-           **Detailed arguments for async_update_configs**
+           **Detailed arguments for a_sync_configs**
            **k_step**: number of local optimization updates before communication
            **max_merge_var_num**: maximum number of merged gradients before communication
            **send_queue_size**: a buffer size of worker communication
@@ -255,19 +255,20 @@ class DistributedStrategy(object):
            fleet.init(role_maker)
            strategy = fleet.DistributedStrategy()
-           strategy.async_update = True  # by default this is True
+           strategy.a_sync = True  # by default this is True
            configs = {"k_step": 10000, "send_queue_size": 32}
-           strategy.async_update_configs = configs
+           strategy.a_sync_configs = configs
            # code block for defining loss and local optimizer
            # sgd = fleet.distributed_optimizer(optimizer, strategy)
        """
-        return get_msg_dict(self.strategy.async_configs)
+        return get_msg_dict(self.strategy.a_sync_configs)
 
-    @async_update_configs.setter
-    def async_update_configs(self, configs):
-        check_configs_key(self.strategy.async_configs, configs, "async_configs")
-        assign_configs_value(self.strategy.async_configs, configs)
+    @a_sync_configs.setter
+    def a_sync_configs(self, configs):
+        check_configs_key(self.strategy.a_sync_configs, configs, "a_sync_configs")
+        assign_configs_value(self.strategy.a_sync_configs, configs)
 
     @property
     def amp(self):
@@ -584,4 +585,7 @@ class DistributedStrategy(object):
                 print("WARNING: auto should have value of bool type")
 
     def __repr__(self):
+        fields = self.strategy.DESCRIPTOR.fields
+        for f in fields:
+            print("{}: {}".format(f.name, f.default_value))
         return str(self.strategy)
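
With the __repr__ change above, printing a DistributedStrategy instance first emits one "name: default" line per proto field and then the serialized strategy text returned by str(self.strategy). A small sketch of the call pattern (output omitted, since it depends on the proto defaults):

    import paddle.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    print(strategy)  # field/default listing printed by __repr__, followed by str(self.strategy)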

python/paddle/fleet/base/fleet_base.py

@@ -228,6 +228,7 @@ class Fleet(object):
         """
         self.user_defined_optimizer = optimizer
         self.user_defined_strategy = strategy
+        self.valid_strategy = None
         return self
 
     def minimize(self,
@@ -292,8 +293,10 @@ class Fleet(object):
         distributed_optimizer_list = \
             MetaOptimizerFactory()._get_valid_meta_optimizers(
                 self.user_defined_optimizer)
         valid_optimizer_list = []
         valid_graph_optimizer_list = []
+        can_not_apply_optimizer_list = []
         # recall meta optimizers for ranking
         for opt in distributed_optimizer_list:
             opt._set_basic_info(loss, self._role_maker,
@@ -301,15 +304,21 @@ class Fleet(object):
                                 self.user_defined_strategy)
             if opt._can_apply() and not opt._is_graph_out():
                 valid_optimizer_list.append(opt)
-            if opt._can_apply() and opt._is_graph_out():
+            elif opt._can_apply() and opt._is_graph_out():
                 valid_graph_optimizer_list.append(opt)
+            else:
+                can_not_apply_optimizer_list.append(opt)
         # combine recalled meta optimizers to be a valid meta optimizer
-        meta_optimizer, graph_optimizer, final_dist_strategy = \
+        meta_optimizer, graph_optimizer = \
             self.strategy_compiler.generate_optimizer(
                 loss, self._role_maker, self.user_defined_optimizer,
                 self.user_defined_strategy, valid_optimizer_list,
                 valid_graph_optimizer_list)
+
+        valid_strategy = self.strategy_compiler._get_valid_strategy(
+            self.user_defined_strategy, can_not_apply_optimizer_list)
+
+        self.valid_strategy = valid_strategy
+
         optimize_ops = []
         params_grads = []
 
         if meta_optimizer:
@@ -332,12 +341,10 @@ class Fleet(object):
 
         if self._runtime_handle is None:
             self._runtime_handle = RuntimeFactory()._create_runtime(
-                final_dist_strategy, self._role_maker, optimize_ops,
-                params_grads)
+                valid_strategy, self._role_maker, optimize_ops, params_grads)
 
         if self._util is None:
-            self._util = UtilFactory()._create_util(
-                final_dist_strategy, self._role_maker, optimize_ops,
-                params_grads)
+            self._util = UtilFactory()._create_util(
+                valid_strategy, self._role_maker, optimize_ops, params_grads)
 
         return optimize_ops, params_grads

python/paddle/fleet/base/strategy_compiler.py

@@ -30,7 +30,7 @@ def maximum_path_len_algo(optimizer_list):
         return None
     for idx, opt in enumerate(candidates[max_idx][:-1]):
         opt._update_inner_optimizer(candidates[max_idx][idx + 1])
-    return candidates[max_idx][0]
+    return candidates[max_idx]
 
 
 class StrategyCompilerBase(object):
@@ -51,19 +51,55 @@ class StrategyCompiler(StrategyCompilerBase):
     def __init__(self):
         super(StrategyCompiler, self).__init__()
         self._meta_optimizer = None
         self._graph_optimizer = None
         self._valid_optimizer_list = None
+        self._user_defined_strategy = None
+        self._meta_optimizer_candidates = []
+        self._graph_optimizer_candidates = []
+
+    def _get_valid_strategy(self, dist_strategy, can_not_apply_optimizer_list):
+        import copy
+        valid_strategy = copy.copy(dist_strategy)
+        invalid_optimizers = []
+        for candidate in self._meta_optimizer_candidates:
+            is_valid = False
+            for valid in self._meta_optimizers:
+                if candidate.__class__.__name__ == valid.__class__.__name__:
+                    is_valid = True
+                    break
+            if not is_valid:
+                invalid_optimizers.append(candidate)
+        for opt in invalid_optimizers:
+            opt._disable_strategy(valid_strategy)
+        for opt in can_not_apply_optimizer_list:
+            opt._disable_strategy(valid_strategy)
+        return valid_strategy
 
     def generate_optimizer(self, loss, role_maker, optimizer,
-                           userd_defined_strategy, meta_optimizer_list,
+                           user_defined_strategy, meta_optimizer_list,
                            graph_optimizer_list):
+        self._user_defined_strategy = user_defined_strategy
+        self._meta_optimizer_candidates = meta_optimizer_list
+        self._graph_optimizer_candidates = graph_optimizer_list
+
         if len(meta_optimizer_list) == 0 and len(graph_optimizer_list) == 0:
             return optimizer, None
         else:
             # currently, we use heuristic algorithm to select
             # meta optimizers combinations
-            meta_optimizer = maximum_path_len_algo(meta_optimizer_list)
-            graph_optimizer = maximum_path_len_algo(graph_optimizer_list)
+            meta_optimizers = maximum_path_len_algo(meta_optimizer_list)
+            graph_optimizers = maximum_path_len_algo(graph_optimizer_list)
             # should design a distributed strategy update interface
             # when we have finally decided the combination of meta_optimizer
             # and graph_optimizer, the corresponding distributed strategy
             # should be updated.
-            return meta_optimizer, graph_optimizer, None
+            self._meta_optimizers = meta_optimizers
+            self._graph_optimizers = graph_optimizers
+            return_meta = None if meta_optimizers == None else meta_optimizers[0]
+            return_graph = None if graph_optimizers == None else graph_optimizers[0]
+            return return_meta, return_graph
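
Two behavioral points above are easy to miss: maximum_path_len_algo now returns the whole chained candidate list (generate_optimizer unwraps element 0 itself), and _get_valid_strategy calls _disable_strategy on every candidate that was not selected as well as on every optimizer that could not apply. A self-contained sketch of that disable pass using mock objects (the class names and flags are invented for illustration, not part of this commit):

    import copy

    class FakeStrategy(object):
        def __init__(self):
            self.recompute = True
            self.a_sync = True

    class FakeRecomputeOptimizer(object):
        def _disable_strategy(self, dist_strategy):
            dist_strategy.recompute = False

    class FakeASyncOptimizer(object):
        def _disable_strategy(self, dist_strategy):
            dist_strategy.a_sync = False

    user_strategy = FakeStrategy()
    candidates = [FakeRecomputeOptimizer(), FakeASyncOptimizer()]
    selected = [FakeRecomputeOptimizer()]  # suppose only recompute survived selection

    valid_strategy = copy.copy(user_strategy)
    selected_names = [s.__class__.__name__ for s in selected]
    for candidate in candidates:
        # same class-name comparison used by _get_valid_strategy
        if candidate.__class__.__name__ not in selected_names:
            candidate._disable_strategy(valid_strategy)

    print(valid_strategy.recompute, valid_strategy.a_sync)  # -> True False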

python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py

@@ -16,6 +16,7 @@ from paddle.fluid.framework import core
 from paddle.fluid import compiler
 from .meta_optimizer_base import MetaOptimizerBase
 from ..base.private_helper_function import wait_server_ready
+import logging
 
 
 class GraphExecutionOptimizer(MetaOptimizerBase):
@@ -32,6 +33,10 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         """
         Basically, this is PE, and almost all programs can be executed here
         """
+        if not self.role_maker._is_collective:
+            # update me. currently, if parameter server is used
+            # graph execution optimizer can not be applied
+            return False
         return True
 
     def backward(self,
@@ -178,6 +183,10 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
 
         return self._compiled_program
 
+    def _disable_strategy(self, dist_strategy):
+        # TODO(guru4elephant): should close all PE related flags here
+        pass
+
     def minimize(self,
                  loss,
                  startup_program=None,

python/paddle/fleet/meta_optimizers/meta_optimizer_base.py

@@ -39,6 +39,9 @@ class MetaOptimizerBase(object):
         if str(optimizer.__class__.__name__) in self.meta_optimizers_white_list:
             return True
 
+    def _disable_strategy(self, dist_strategy):
+        raise NotImplementedError("you should implement disable strategy")
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
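
Because the base class now raises NotImplementedError, every concrete meta optimizer is expected to override _disable_strategy and reset only the strategy flags it owns (RecomputeOptimizer in the next file shows the real pattern). A hedged sketch of such an override for a hypothetical optimizer; the class name, the choice of the dgc flag, and the import path are assumptions for illustration:

    from paddle.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase

    class MyDGCMetaOptimizer(MetaOptimizerBase):
        # hypothetical subclass; a real meta optimizer also implements
        # _can_apply() and minimize_impl()
        def _disable_strategy(self, dist_strategy):
            # switch off only the flags this optimizer is responsible for
            dist_strategy.dgc = False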

python/paddle/fleet/meta_optimizers/recompute_optimizer.py

@@ -34,11 +34,16 @@ class RecomputeOptimizer(MetaOptimizerBase):
 
     def _can_apply(self):
         if self.user_defined_strategy.recompute == True:
-            if len(self.user_defined_strategy.recompute_checkpoints) == 0:
+            if len(self.user_defined_strategy.recompute_configs[
+                    "checkpoints"]) == 0:
                 return False
             else:
                 return True
 
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.recompute = False
+        dist_strategy.recompute_configs = {"checkpoints": []}
+
     def backward(self,
                  loss,
                  startup_program=None,
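
For users, the recompute checkpoints consequently move from the removed recompute_checkpoints attribute into the recompute_configs dict, which is exactly what the updated unit test below exercises. A short usage sketch under that reading (the checkpoint names are placeholders):

    import paddle.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True
    # checkpoints are now passed through the configs dict
    strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0", "fc_2.tmp_0"]}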

python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py

@@ -178,20 +178,20 @@ class TestStrategyConfig(unittest.TestCase):
         strategy.lamb = "True"
         self.assertEqual(strategy.lamb, False)
 
-    def test_async_update(self):
+    def test_a_sync(self):
         strategy = paddle.fleet.DistributedStrategy()
-        strategy.async_update = True
-        self.assertEqual(strategy.async_update, True)
-        strategy.async_update = False
-        self.assertEqual(strategy.async_update, False)
-        strategy.async_update = "True"
-        self.assertEqual(strategy.async_update, False)
-
-    def test_async_configs(self):
+        strategy.a_sync = True
+        self.assertEqual(strategy.a_sync, True)
+        strategy.a_sync = False
+        self.assertEqual(strategy.a_sync, False)
+        strategy.a_sync = "True"
+        self.assertEqual(strategy.a_sync, False)
+
+    def test_a_sync_configs(self):
         strategy = paddle.fleet.DistributedStrategy()
         configs = {"k_steps": 1000}
-        strategy.async_update_configs = configs
-        self.assertEqual(strategy.async_update_configs["k_steps"], 1000)
+        strategy.a_sync_configs = configs
+        self.assertEqual(strategy.a_sync_configs["k_steps"], 1000)
 
     def test_elastic(self):
         strategy = paddle.fleet.DistributedStrategy()
@@ -213,7 +213,7 @@ class TestStrategyConfig(unittest.TestCase):
     def test_strategy_prototxt(self):
         strategy = paddle.fleet.DistributedStrategy()
-        strategy.async_update = True
+        strategy.a_sync = True
         strategy.localsgd = True
         strategy.dgc = True
         localsgd_configs = {"k_steps": 5}

python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer.py

@@ -25,6 +25,27 @@ class TestFleetMetaOptimizer(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
             "127.0.0.1:36001,127.0.0.2:36001"
 
+    def test_graph_execution_optimizer_not_apply(self):
+        import paddle.fleet as fleet
+        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.fleet.DistributedStrategy()
+        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
     def test_graph_execution_optimizer(self):
         import paddle.fleet as fleet
         import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -42,7 +63,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.fleet.DistributedStrategy()
         strategy.nccl_comm_num = 2
         optimizer = paddle.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
@@ -65,7 +86,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
 
         strategy = paddle.fleet.DistributedStrategy()
         strategy.recompute = True
-        strategy.recompute_checkpoints = [fc_2]
+        strategy.recompute_configs = {"checkpoints": ["fc2"]}
         optimizer = paddle.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)