Commit 29d87812 (unverified) in BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Authored by gongweibao on Aug 12, 2019; committed via GitHub on Aug 12, 2019
Polish fleet API to support cuda collective mode and nccl2 mode. (#18966)
Parent commit: b7e1a1d7
Showing 14 changed files with 363 additions and 162 deletions (+363 -162)
paddle/fluid/framework/details/all_reduce_op_handle.cc  +1  -0
paddle/fluid/operators/distributed/grpc/grpc_client.cc  +1  -1
paddle/fluid/platform/device_context.cc  +4  -2
python/paddle/fluid/framework.py  +2  -0
python/paddle/fluid/incubate/fleet/base/fleet_base.py  +8  -0
python/paddle/fluid/incubate/fleet/base/role_maker.py  +25  -5
python/paddle/fluid/incubate/fleet/collective/__init__.py  +164  -137
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py  +8  -0
python/paddle/fluid/optimizer.py  +1  -0
python/paddle/fluid/tests/unittests/CMakeLists.txt  +16  -12
python/paddle/fluid/tests/unittests/dist_mnist.py  +10  -2
python/paddle/fluid/tests/unittests/test_dist_base.py  +86  -1
python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py  +35  -0
python/paddle/fluid/transpiler/distribute_transpiler.py  +2  -2
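Most of the change is on the Python side: Fleet gains node_num/node_id, DistributedStrategy is reworked as a fluid.BuildStrategy subclass, and CollectiveOptimizer.minimize now transpiles the program itself and, in nccl2 mode, compiles it for data-parallel execution. A minimal end-to-end sketch of the polished API, modeled on the trainer path this commit adds to test_dist_base.py; the tiny network, the learning rate, and the environment handling are illustrative assumptions, not part of the diff:

import os
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

# Assumes the PADDLE_TRAINER_ENDPOINTS / PADDLE_CURRENT_ENDPOINT style
# environment prepared by the cluster launcher.
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

image = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')
avg_cost = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))

strategy = DistributedStrategy()             # defaults: mode="nccl2", nccl_comm_num=2
strategy.mode = "collective"                 # switch to cuda collective mode
strategy.collective_mode = "grad_allreduce"  # or "local_sgd"

optimizer = fleet.distributed_optimizer(
    fluid.optimizer.SGD(learning_rate=0.001), strategy=strategy)
optimizer.minimize(avg_cost)

place = fluid.CUDAPlace(int(os.getenv("FLAGS_selected_gpus", "0")))
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
# fleet.main_program is what the trainer runs; fleet._origin_program keeps the
# untranspiled program (e.g. for collecting feed variables).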
paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -22,6 +22,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
+// If you want to change this default value, why?(gongwb)
 DEFINE_bool(sync_nccl_allreduce, true,
             "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -449,7 +449,7 @@ void GRPCClient::Proceed() {
   // destructed at this moment.
   if (FLAGS_v >= 3) {
     std::string msg("GRPCClient Proceed end");
-    fwrite(msg.c_str(), msg.length(), 1, stdout);
+    fwrite(msg.c_str(), msg.length(), 1, stderr);
   }
 }
paddle/fluid/platform/device_context.cc
@@ -32,8 +32,10 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(
-        "Place %s is not supported, Please re-compile with WITH_GPU "
-        "option",
+        "Place %s is not supported, Please check that your paddle compiles "
+        "with WITH_GPU "
+        "option or check that your train process hold the correct gpu_id if "
+        "you use Executor",
         place);
   }
   return it->second.get().get();
python/paddle/fluid/framework.py
@@ -2848,6 +2848,8 @@ class Program(object):
         # use Deep gradient comrepssion or not
         self._enable_dgc = False
+        self._use_lamb = False
+
         self._nccl_comm_num = 1
         self._use_hierarchical_allreduce = False
         self._hierarchical_allreduce_inter_nranks = 0
python/paddle/fluid/incubate/fleet/base/fleet_base.py
@@ -232,6 +232,14 @@ class Fleet(object):
     def save_persistables(self, executor, dirname, main_program=None):
         pass
 
+    @abc.abstractmethod
+    def node_num(self):
+        pass
+
+    @abc.abstractmethod
+    def node_id(self):
+        pass
+
 
 class DistributedOptimizer(object):
     """
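With these two hooks on the base class, trainer code can ask the initialized fleet for its physical topology; a small illustrative call site (assumes the collective fleet from this commit and a completed fleet.init(role)):

from paddle.fluid.incubate.fleet.collective import fleet

# node_num() is the number of distinct node IPs in the job,
# node_id() the index of this trainer's node within that list.
print("job spans %d nodes, this trainer sits on node %d" %
      (fleet.node_num(), fleet.node_id()))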
python/paddle/fluid/incubate/fleet/base/role_maker.py
@@ -350,7 +350,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             for i, ip in enumerate(self.pserver_ips.split(",")):
                 eplist.append(':'.join([ip, ports[i]]))
             self.endpoints = ",".join(eplist)
-            self._trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+            self._trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
             # ip of current node, either a worker or a pserver
             current_ip = os.getenv("POD_IP", "")
             if current_ip == "":
@@ -380,11 +380,31 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             assert (self._training_role == "TRAINER")
             self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
             self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-            if self._worker_endpoints:
-                self._worker_endpoints = self._worker_endpoints.split(",")
-                self._num_trainers = len(self._worker_endpoints)
+            assert self._worker_endpoints is not None, \
+                "can't find PADDLE_TRAINER_ENDPOINTS"
+            self._worker_endpoints = self._worker_endpoints.split(",")
+            self._trainers_num = len(self._worker_endpoints)
+
+            self._node_ips = self._get_node_ips_from_endpoints(
+                self._worker_endpoints)
+            self._node_ip = self._current_endpoint.split(":")[0].strip()
+            self._node_num = len(self._node_ips)
+            self._node_id = self._node_ips.index(self._node_ip)
 
         self._role_is_generated = True
 
+    def _get_node_ips_from_endpoints(self, endpoints):
+        ss = set()
+        ips = []
+        for ep in endpoints:
+            ip = ep.split(":")[0].strip()
+            if ip not in ss:
+                ss.add(ip)
+                ips.append(ip)
+            else:
+                continue
+        return ips
+
     def get_pserver_endpoints(self):
         if not self._role_is_generated:
             self.generate_role()
@@ -418,7 +438,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     def worker_num(self):
         if not self._role_is_generated:
             self.generate_role()
-        return self._trainers
+        return self._trainers_num
 
 
 class UserDefinedRoleMaker(RoleMakerBase):
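The node bookkeeping above needs nothing but the endpoint strings; the same dedup-and-index logic in isolation, as a plain-Python illustration with made-up addresses:

endpoints = ["192.168.0.1:6170", "192.168.0.1:6171",
             "192.168.0.2:6170", "192.168.0.2:6171"]
current_endpoint = "192.168.0.2:6171"

seen, node_ips = set(), []
for ep in endpoints:
    ip = ep.split(":")[0].strip()
    if ip not in seen:        # keep the first occurrence, preserve order
        seen.add(ip)
        node_ips.append(ip)

node_ip = current_endpoint.split(":")[0].strip()
print(len(node_ips), node_ips.index(node_ip))   # node_num == 2, node_id == 1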
python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -21,60 +21,20 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
 from paddle.fluid.incubate.fleet.base.fleet_base import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 from paddle.fluid import compiler
 
-
-class DistributedStrategy(object):
-    def __init__(self):
-        # precision configs
-        self.use_fp16 = False
-        self.use_fp32 = True
-        # algorithmic communication
-        self.local_sgd = False
-        self.dgc = False
-        # communication topology configs
-        self.h_allreduce = False
-
-    def build(self):
-        self.strategy_map = {}
-        # make sure we set single precision config True
-        if self.use_fp32 and self.use_fp16:
-            self.use_fp16 = False
-        # make sure we set single algorithmic communication True
-        if self.local_sgd and self.dgc:
-            self.local_sgd = False
-        self.strategy_map["fp16"] = self.use_fp16
-        self.strategy_map["fp32"] = self.use_fp32
-        self.strategy_map["localsgd"] = self.local_sgd
-        self.strategy_map["dgc"] = self.dgc
-        self.strategy_map["h_allreduce"] = self.h_allreduce
-
-
-class DistributedOptimizerFactory(object):
-    def __init__(self):
-        self.strategy_to_optimizer_map()
-
-    def strategy_to_optimizer_map(self):
-        pattern = {}
-        pattern["fp16"] = ["FP16SGDOptimizer", "FP16LocalSGDOptimizer"]
-        pattern["fp32"] = ["FP32SGDOptimizer", "FP32LocalSGDOptimizer"]
-        pattern["localsgd"] = ["FP16LocalSGDOptimizer", "FP32LocalSGDOptimizer"]
-        pattern["h_allreduce"] = [
-            "FP32SGDOptimizer",
-            "FP32LocalSGDOptimizer",
-            "FP16SGDOptimizer",
-            "FP16LocalSGDOptimizer",
-        ]
-        self.pattern = pattern
-
-    def create_by_strategy(self, optimizer, strategy):
-        if strategy == None:
-            strategy = DistributedStrategy()
-            strategy.build()
-        strategy_list = []
-        for key in strategy.strategy_map:
-            if strategy.strategy_map[key]:
-                strategy_list.append(self.pattern[key])
-        classname = list(set.intersection(*map(set, strategy_list)))[0]
-        return globals()[classname](optimizer, strategy)
+import os
+import sys
+
+
+class LambConfig(object):
+    def __init__(self):
+        pass
+
+
+class DistFCConfig(object):
+    def __init__(self):
+        pass
 
 
 class Collective(Fleet):
@@ -82,6 +42,10 @@ class Collective(Fleet):
         super(Collective, self).__init__(Mode.COLLECTIVE)
         self._local_ip = 0
 
+        self.startup_program = None
+        self._origin_program = None
+        self.main_program = None
+
     def init_worker(self):
         logging.warn(
             "You should not call 'init_worker' method for collective mode.")
@@ -103,10 +67,8 @@ class Collective(Fleet):
             "You should not call 'stop_worker' method for collective mode.")
 
     def distributed_optimizer(self, optimizer, strategy=None):
-        optimizer_factory = DistributedOptimizerFactory()
-        self._optimizer = \
-            optimizer_factory.create_by_strategy(optimizer, strategy)
+        self._optimizer = CollectiveOptimizer(optimizer, strategy)
         return self._optimizer
 
     def save_inference_model(self,
@@ -117,16 +79,56 @@ class Collective(Fleet):
                              main_program=None,
                              export_for_deployment=True):
         io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                self._executor, main_program, None, None,
+                                executor, main_program, None, None,
                                 export_for_deployment)
 
     def save_persistables(self, executor, dirname, main_program=None):
-        io.save_persistables(self._executor, dirname, main_program, None)
+        io.save_persistables(executor, dirname, main_program, None)
+
+    def node_num(self):
+        return self._role_maker._node_num
+
+    def node_id(self):
+        return self._role_maker._node_id
 
 
 fleet = Collective()
 
 
+class DistributedStrategy(fluid.BuildStrategy):
+    """
+    Init function of DistributedStrategy
+    """
+
+    def __init__(self):
+        super(DistributedStrategy, self).__init__()
+        self.fuse_memory_size = -1
+        self.fuse_layer_size = 1
+
+        self.use_local_sgd = False
+        self.use_dist_fc = False
+
+        self.local_sgd_config = None  # LocalSGDConfig
+        self.dist_fc_config = None  # DistFCConfig
+        self.mode = "nccl2"  # or collective
+        self.collective_mode = None  # local_sgd or grad_allreduce
+        self.nccl_comm_num = 2
+
+        self.exec_strategy = fluid.ExecutionStrategy()
+        sync_allreduce = os.getenv("FLAGS_sync_nccl_allreduce")
+        if sync_allreduce == "0":
+            self._exec_strategy.num_threads = self.nccl_comm_num + 1
+            if sef.use_hierarchical_allreduce:
+                self._exec_strategy.num_threads = 2 * self.nccl_comm_num + 1
+            if self._exec_strategy.num_threads > 4:
+                print(
+                    sys.stderr,
+                    "WARNING: if you use use_hierarchical_allreduce or "
+                    "with multi nccl comm, please set FLAGS_sync_nccl_allreduce = 0")
+
+
 class CollectiveOpBasedOptimizer(DistributedOptimizer):
     """
     Collective Operator Base Class For Distributed Optimizer
@@ -134,6 +136,9 @@ class CollectiveOpBasedOptimizer(DistributedOptimizer):
     """
 
     def __init__(self, optimizer, strategy=None):
+        assert isinstance(
+            strategy,
+            DistributedStrategy), "strategy must be DistributedStrategy"
         super(CollectiveOpBasedOptimizer, self).__init__(optimizer, strategy)
 
     def backward(self,
                  loss,
@@ -149,69 +154,6 @@ class CollectiveOpBasedOptimizer(DistributedOptimizer):
         return self._optimizer.apply_gradients(params_grads)
 
 
-class FP16SGDOptimizer(CollectiveOpBasedOptimizer):
-    """
-    do all reduce within every minibatch
-    """
-
-    def __init__(self, optimizer, strategy=None):
-        super(FP16SGDOptimizer, self).__init__(optimizer, strategy)
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        pass
-
-
-class FP32LocalSGDOptimizer(CollectiveOpBasedOptimizer):
-    def __init__(self, optimizer, strategy=None):
-        super(FP32LocalSGDOptimizer, self).__init__(optimizer, strategy)
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        opts, param_and_grads = self._optimizer.minimize(loss)
-        config = fluid.DistributeTranspilerConfig()
-        config.mode = 'collective'
-        config.collective_mode = 'local_sgd'
-        t = fluid.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=fleet.worker_index(),
-            trainers=fleet.worker_endpoints(),
-            current_endpoint=fleet.worker_endpoints()[fleet.worker_index()],
-            startup_program=startup_program,
-            program=loss.block.program)
-        return opts, param_and_grads
-
-
-class FP32SGDOptimizer(CollectiveOpBasedOptimizer):
-    def __init__(self, optimizer, strategy=None):
-        super(FP32SGDOptimizer, self).__init__(optimizer, strategy)
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        opts, param_and_grads = self._optimizer.minimize(loss)
-        config = fluid.DistributeTranspilerConfig()
-        config.mode = 'collective'
-        config.collective_mode = 'grad_allreduce'
-        t = fluid.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=fleet.worker_index(),
-            trainers=fleet.worker_endpoints(),
-            current_endpoint=fleet.worker_endpoints()[fleet.worker_index()],
-            startup_program=startup_program,
-            program=loss.block.program)
-        return opts, param_and_grads
-
-
 class CollectiveOptimizer(DistributedOptimizer):
     """
     DistributedOptimizer is a wrapper for paddle.fluid.optimizer
@@ -223,9 +165,9 @@ class CollectiveOptimizer(DistributedOptimizer):
     training.
     """
 
-    def __init__(self, optimizer, strategy=None):
+    def __init__(self, optimizer, strategy=DistributedStrategy()):
         super(CollectiveOptimizer, self).__init__(optimizer, strategy)
-        self.strategy = strategy
+        self.print_config = False
 
     def backward(self,
                  loss,
@@ -239,6 +181,95 @@ class CollectiveOptimizer(DistributedOptimizer):
     def apply_gradients(self, params_grads):
         return self._optimizer.apply_gradients(params_grads)
 
+    def _check_condition(self, name, **kwargs):
+        for k, v in kwargs.iterms():
+            if v is True:
+                assert False, "you can't use %s and %s together" % (name, k)
+
+    def _check_collective_mode(self, main_program, optimizer, strategy):
+        """
+        Check the conflict condtions.
+        """
+        if strategy.use_local_sgd:
+            self._check_condition(
+                "use_local_sgd",
+                use_dgc=main_program._enable_dgc,
+                use_dist_fc=strategy.use_dist_fc,
+                use_lamb=main_program._use_lamb)
+            assert strategy.local_sgd_config is not None, \
+                "DistributedStrategy.local_sgd_config should be set"
+
+        if strategy.use_dist_fc:
+            self._check_condition(
+                "use_dist_fc",
+                use_dgc=main_program._enable_dgc,
+                use_local_sgd=strategy.use_local_sgd,
+                use_lamb=main_program._use_lamb)
+            assert strategy.dist_fc_config is not None, \
+                "DistributedStrategy.dist_fc_config should be set"
+
+        if self._strategy.collective_mode == "local_sgd" \
+                or self._strategy.collective_mode == "grad_allreduce":
+            assert self._strategy.mode == "collective", \
+                "local_sgd and grad_allreduce can be used under collective mode"
+
+    def _transpile(self, startup_program, main_program):
+        """
+        Transpile the programs to distributed programs. And add the variables.
+        """
+        if self._strategy.fuse_all_reduce_ops:
+            os.environ['FLAGS_fuse_parameter_memory_size'] = self.fuse_memory_size
+            os.environ['FLAGS_fuse_parameter_groups_size'] = self.fuse_layer_size
+
+        worker_endpoints = fleet.worker_endpoints()
+        trainer_id = fleet.worker_index()
+        current_endpoint = fleet.worker_endpoints()[trainer_id]
+        worker_endpoints_env = ','.join(worker_endpoints)
+        trainers_num = fleet.worker_num()
+
+        if self.print_config:
+            print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
+                  trainer_id:{}".format(worker_endpoints, trainers_num,
+                                        current_endpoint, trainer_id))
+
+        # call transpiler
+        config = dist_transpiler.DistributeTranspilerConfig()
+        config.mode = self._strategy.mode
+        config.collective_mode = self._strategy.collective_mode
+
+        config.nccl_comm_num = self._strategy.nccl_comm_num
+        config.use_hierarchical_allreduce = self._strategy.use_hierarchical_allreduce
+        config.hierarchical_allreduce_inter_nranks = \
+            self._strategy.hierarchical_allreduce_inter_nranks
+
+        t = dist_transpiler.DistributeTranspiler(config=config)
+        t.transpile(
+            trainer_id=trainer_id,
+            trainers=worker_endpoints_env,
+            startup_program=startup_program,
+            program=main_program,
+            current_endpoint=current_endpoint)
+
+    def _try_to_compile(self, startup_program, main_program):
+        self._transpile(startup_program, main_program)
+
+        if self._strategy.mode == "collective":
+            return main_program
+
+        self._strategy.num_trainers = fleet.worker_num()
+        self._strategy.trainer_id = fleet.worker_index()
+        self._strategy.trainers_endpoints = fleet.worker_endpoints()
+        self._strategy.enable_backward_optimizer_op_deps = True
+
+        self._compiled_program = compiler.CompiledProgram(main_program)
+
+        self._compiled_program.with_data_parallel(
+            loss_name=self._loss.name,
+            build_strategy=self._strategy,
+            exec_strategy=self._strategy.exec_strategy,
+            share_vars_from=None)
+
+        return self._compiled_program
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -260,24 +291,20 @@ class CollectiveOptimizer(DistributedOptimizer):
         process, but currently the optimization part is written into Fleet(). A user does not
         need to care about how to startup a pserver node.
         """
-        optimize_ops, param_grads = self._optimizer.minimize(
-            loss, startup_program, parameter_list, no_grad_set)
-
         main_program = loss.block.program
+        if startup_program is None:
+            startup_program = fluid.default_startup_program()
+        fleet.startup_program = startup_program
 
-        worker_endpoints = fleet.worker_endpoints()
-        trainer_id = fleet.worker_index()
-        current_endpoint = fleet.worker_endpoints()[trainer_id]
+        self._loss = loss
 
-        startup_program = startup_program if startup_program else \
-            fluid.framework.default_startup_program
+        self._check_collective_mode(main_program, self._optimizer,
+                                    self._strategy)
 
-        # call transpiler
-        config = dist_transpiler.DistributeTranspilerConfig()
-        config.mode = "nccl2"
-        t = dist_transpiler.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id,
-            trainers=','.join(worker_endpoints),
-            startup_program=startup_program,
-            current_endpoint=current_endpoint)
+        optimize_ops, param_grads = self._optimizer.minimize(
+            loss, startup_program, parameter_list, no_grad_set)
 
         fleet._origin_program = main_program
+        fleet.main_program = self._try_to_compile(startup_program, main_program)
 
         return optimize_ops, param_grads
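The new DistributedStrategy mixes inherited build-strategy knobs with the transpiler-facing fields that _transpile copies into DistributeTranspilerConfig. A configuration sketch for the two supported paths, restricted to fields the class sets in __init__; the SGD optimizer is a placeholder and fleet.init(role) is assumed to run before minimize():

import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

# nccl2 mode (the default): minimize() additionally wraps the transpiled
# program in a CompiledProgram with data parallelism.
nccl2_strategy = DistributedStrategy()          # mode="nccl2", nccl_comm_num=2
nccl2_strategy.exec_strategy.num_threads = 3    # tuned together with FLAGS_sync_nccl_allreduce

# cuda collective mode: the transpiled main_program is used directly.
collective_strategy = DistributedStrategy()
collective_strategy.mode = "collective"
collective_strategy.collective_mode = "grad_allreduce"   # or "local_sgd"

dist_optimizer = fleet.distributed_optimizer(
    fluid.optimizer.SGD(learning_rate=0.01), strategy=collective_strategy)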
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -239,6 +239,14 @@ class DistributedTranspiler(Fleet):
         self.main_program, self.startup_program = \
             self._transpiler.get_pserver_programs(
                 self.server_endpoints()[self.server_index()])
 
+    def node_num(self):
+        logging.warn(
+            "You should not call 'node_num' method for collective mode.")
+
+    def node_id(self):
+        logging.warn(
+            "You should not call 'node_id' method for collective mode.")
+
 
 fleet = DistributedTranspiler()
python/paddle/fluid/optimizer.py
@@ -2176,6 +2176,7 @@ class LambOptimizer(AdamOptimizer):
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+        block.program._use_lamb = True
 
         moment1 = self._get_accumulator(self._moment1_acc_str,
                                         param_and_grad[0])
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -8,6 +8,7 @@ if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
     list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_fleetapi)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm)
@@ -236,29 +237,32 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+        set_tests_properties(test_dist_mnist_fleetapi PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
         list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
         list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
         list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
         py_test_modules(test_dist_se_resnext_dgc MODULES test_dist_se_resnext_dgc)
         py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync)
         py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
         bash_test_modules(test_launch MODULES test_launch.sh)
         # FIXME(typhoonzero): add these tests back
         # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
         # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
         set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     endif(NOT APPLE)
     # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -29,6 +29,7 @@ import os
 import signal
 from functools import reduce
 from test_dist_base import TestDistRunnerBase, runtime_main
+from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 
 DTYPE = "float32"
 paddle.dataset.mnist.fetch()
@@ -73,7 +74,7 @@ def cnn_model(data):
 
 class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2, use_dgc=False):
+    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
@@ -104,7 +105,14 @@ class TestDistMnist2x2(TestDistRunnerBase):
             paddle.dataset.mnist.test(), batch_size=batch_size)
         test_reader = paddle.batch(
             paddle.dataset.mnist.test(), batch_size=batch_size)
-        opt.minimize(avg_cost)
+
+        if dist_strategy:
+            dist_opt = fleet.distributed_optimizer(
+                optimizer=opt, strategy=dist_strategy)
+            _, param_grads = dist_opt.minimize(avg_cost)
+        else:
+            opt.minimize(avg_cost)
+
         return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
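The harness passes the strategy straight through get_model, so the same model file drives both the plain nccl2 run and the fleet-API run. An illustrative pair of calls mirroring run_gpu_fleet_api_trainer in test_dist_base.py; the second call assumes fleet.init(role) has already happened in the surrounding trainer, as it does in the harness:

from dist_mnist import TestDistMnist2x2
from paddle.fluid.incubate.fleet.collective import DistributedStrategy

model = TestDistMnist2x2()

# Plain run: opt.minimize(avg_cost) is called directly inside get_model.
model.get_model(batch_size=2)

# Fleet-API run: get_model wraps the optimizer via fleet.distributed_optimizer.
model.get_model(batch_size=2, dist_strategy=DistributedStrategy())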
python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -31,6 +31,9 @@ import paddle.fluid.dygraph as dygraph
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.dygraph.parallel import DataParallel
 
+from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+
 RUN_STEP = 5
 DEFAULT_BATCH_SIZE = 2
@@ -44,6 +47,10 @@ def my_print(class_name, log_str):
         sys.stderr.buffer.write(pickle.dumps(print_str))
 
 
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
 class TestDistRunnerBase(object):
     def get_model(self,
                   batch_size=DEFAULT_BATCH_SIZE,
@@ -96,6 +103,72 @@ class TestDistRunnerBase(object):
         exe.run(pserver_prog)
         my_print(type(self).__name__, "run pserver main program done.")
 
+    def run_gpu_fleet_api_trainer(self, args):
+        assert args.update_method == "nccl2"
+
+        self.lr = args.lr
+
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_threads = 1
+
+        dist_strategy = DistributedStrategy()
+        dist_strategy.exec_strategy = exec_strategy
+        dist_strategy.fuse_memory_size = 1  #MB
+        dist_strategy.fuse_laryer_size = 1
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        my_print("gpu_fleet", "fleet.node_num:")
+        #"fleet.node_id:", fleet.node_id(),
+        #"fleet.trainer_num:", fleet.worker_num())
+
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
+
+        trainer_prog = fleet._origin_program
+        dist_prog = fleet.main_program
+
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = fluid.CUDAPlace(device_id)
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        eprint(type(self).__name__, "run worker startup program done.")
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.values()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.update_method != "local" and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        my_print(type(self).__name__, "begin to train on trainer")
+        out_losses = []
+        for i in six.moves.xrange(RUN_STEP):
+            loss, = exe.run(dist_prog,
+                            fetch_list=[avg_cost.name],
+                            feed=feeder.feed(get_data()))
+            out_losses.append(loss[0])
+            my_print(type(self).__name__, "run step %d finished" % i)
+        my_print(type(self).__name__, "trainer run finished")
+
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
+
     def run_trainer(self, args):
         self.lr = args.lr
         if args.nccl2_reduce_layer_local_run:
@@ -318,6 +391,7 @@ def runtime_main(test_class):
     parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
     parser.add_argument('--enable_backward_deps', action='store_true')
     parser.add_argument('--use_hallreduce', action='store_true')
+    parser.add_argument('--gpu_fleet_api', action='store_true')
     parser.add_argument(
         '--hallreduce_inter_nranks', type=int, required=False, default=2)
     parser.add_argument(
@@ -344,6 +418,8 @@ def runtime_main(test_class):
     model = test_class()
     if args.role == "pserver" and args.update_method == "pserver":
         model.run_pserver(args)
+    elif args.gpu_fleet_api:
+        model.run_gpu_fleet_api_trainer(args)
     else:
         model.run_trainer(args)
@@ -397,6 +473,7 @@ class TestDistBase(unittest.TestCase):
         self._dygraph = False
         self._nccl_comm_num = 1
         self._enable_backward_deps = False
+        self._gpu_fleet_api = False
         self._use_hallreduce = False
         self._setup_config()
         self._after_setup_config()
@@ -600,7 +677,9 @@ class TestDistBase(unittest.TestCase):
             env.update({
                 "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id),
                 "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
-                "PADDLE_TRAINER_ID": "{}".format(trainer_id)
+                "PADDLE_TRAINER_ID": "{}".format(trainer_id),
+                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+                "PADDLE_CURRENT_ENDPOINT": ep,
             })
         else:
             env.update({'CPU_NUM': '1'})
@@ -620,6 +699,9 @@ class TestDistBase(unittest.TestCase):
         if self._enable_backward_deps:
             tr_cmd += " --enable_backward_deps"
 
+        if self._gpu_fleet_api:
+            tr_cmd += " --gpu_fleet_api"
+
         return tr_cmd, env
 
     def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
@@ -669,6 +751,9 @@ class TestDistBase(unittest.TestCase):
             pipes[i].close()
             sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
 
+        if check_error_log:
+            print("outs[0]:", outs[0])
+            print("outs[1]:", outs[1])
+
         return pickle.loads(outs[0]), pickle.loads(outs[1])
 
     def check_with_place(self,
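In the nccl2/fleet-API path each trainer subprocess now also receives the full endpoint list and its own endpoint, which is exactly what PaddleCloudRoleMaker(is_collective=True) reads back during fleet.init. A sketch of the per-trainer environment the harness assembles, with illustrative values; the remaining PADDLE_* variables are assumed to be set elsewhere by the harness:

trainer_id, trainer_num = 0, 2
ps_endpoints = "127.0.0.1:6170,127.0.0.1:6171"
ep = ps_endpoints.split(",")[trainer_id]

env = {
    "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id),
    "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
    "PADDLE_TRAINER_ID": "{}".format(trainer_id),
    "PADDLE_TRAINER_ENDPOINTS": ps_endpoints,
    "PADDLE_CURRENT_ENDPOINT": ep,
}
# The trainer command then carries "--update_method nccl2 --gpu_fleet_api".
print(env)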
python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
new file (mode 100644)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import unittest
from test_dist_base import TestDistBase


class TestDistMnistNCCL2FleetApi(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._use_reduce = False
        self._use_reader_alloc = False
        self._nccl2_mode = True
        self._gpu_fleet_api = True

    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
            self.check_with_place("dist_mnist.py", delta=1e-5)


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -174,7 +174,7 @@ class DistributeTranspilerConfig(object):
     hierarchical_allreduce_inter_nranks = 0
 
     # if mode is collective
-    # supported modes: sgd, local_sgd
+    # supported modes: grad_allreduce, local_sgd
     collective_mode = None
@@ -431,7 +431,7 @@ class DistributeTranspiler(object):
             trainers_num = len(self.origin_program._trainers_endpoints)
             # selected automaticly
             if self.config.hierarchical_allreduce_inter_nranks <= 1:
-                self.config.hierarchical_allreduce_inter_nranks = fluid.core.get_cuda_device_count(
+                self.config.hierarchical_allreduce_inter_nranks = core.get_cuda_device_count(
                 )
             assert trainers_num > self.config.hierarchical_allreduce_inter_nranks, \
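For completeness, the transpiler-level configuration that CollectiveOptimizer._transpile assembles from the strategy can also be written out by hand; a standalone sketch with illustrative endpoints and the default programs (two trainers on localhost are an assumption made only for this example):

import paddle.fluid as fluid

config = fluid.DistributeTranspilerConfig()
config.mode = "collective"
config.collective_mode = "grad_allreduce"   # supported: grad_allreduce, local_sgd
config.nccl_comm_num = 2

t = fluid.DistributeTranspiler(config=config)
t.transpile(
    trainer_id=0,
    trainers="127.0.0.1:6170,127.0.0.1:6171",
    current_endpoint="127.0.0.1:6170",
    startup_program=fluid.default_startup_program(),
    program=fluid.default_main_program())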